げっちゅな屋でスクレイピング2
何
以前のエントリで書いたのは月別の発売日を出力するものでしたが、今回は検索を行うものを書きました。基本は一緒で、URL、クエリ、matcherなどが変わりました。要は別サイトをスクレイピングした感じになりました。
詰まった所
- 検索クエリをEUC-JPにエンコードするのを忘れていて、原因に気づくまでしばらく時間を使った。
- 発売日の部分に目印となる特徴的なタグ(クラスなど)が付いていない(発売日だけを抜き出すのは諦めて、該当箇所である content_block のテキストを全部出すことにした)。
code
// Command-line scraper that posts a search form to getchu.com and prints the
// text of every result block found in the response.
package main

import (
	"flag"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"strings"
	"time"

	"github.com/yhat/scrape"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
	"golang.org/x/text/encoding/japanese"
	"golang.org/x/text/transform"
)

const (
	// getchuURL is the search endpoint the form is posted to.
	getchuURL = "http://www.getchu.com/php/nsearch.phtml"

	// requestTimeout bounds the whole HTTP exchange so a stalled server
	// cannot hang the command forever.
	requestTimeout = 30 * time.Second
)

// Search parameters, populated from the command-line flags.
var (
	keyword   string
	title     string
	brand     string
	person    string
	isbn      string
	jan       string
	age       string
	genre     string
	startDate string
	endDate   string
	sort      string
	sort2     string
	listCount string
	listType  string
	search    string
)

// transcode runs str through t and returns the transformed text.
func transcode(str string, t transform.Transformer) (string, error) {
	out, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(str), t))
	if err != nil {
		return "", err
	}
	return string(out), nil
}

// eucjpToUtf8 decodes an EUC-JP encoded string into UTF-8.
func eucjpToUtf8(str string) (string, error) {
	return transcode(str, japanese.EUCJP.NewDecoder())
}

// utf8ToEucjp encodes a UTF-8 string as EUC-JP. It returns an error when str
// contains a character the encoder cannot represent.
func utf8ToEucjp(str string) (string, error) {
	return transcode(str, japanese.EUCJP.NewEncoder())
}

// registerFlags binds every long and short flag to its package-level variable.
func registerFlags() {
	flag.StringVar(&keyword, "keyword", "", "search keyword")
	flag.StringVar(&keyword, "k", "", "search keyword short")
	flag.StringVar(&title, "title", "", "search title")
	flag.StringVar(&title, "t", "", "search title short")
	flag.StringVar(&brand, "brand", "", "search brand")
	flag.StringVar(&brand, "b", "", "search brand short")
	flag.StringVar(&person, "person", "", "search person")
	flag.StringVar(&person, "p", "", "search person short")
	flag.StringVar(&isbn, "isbn", "", "search isbn")
	flag.StringVar(&isbn, "i", "", "search isbn short")
	flag.StringVar(&jan, "jan", "", "search jan")
	flag.StringVar(&jan, "j", "", "search jan short")
	flag.StringVar(&age, "age", "", "age")
	flag.StringVar(&age, "a", "", "age short")
	flag.StringVar(&genre, "genre", "pc_soft", "genre")
	flag.StringVar(&genre, "g", "pc_soft", "genre default pc_soft")
	flag.StringVar(&startDate, "startdate", "", "start date")
	flag.StringVar(&startDate, "sd", "", "start date short")
	flag.StringVar(&endDate, "enddate", "", "end date")
	flag.StringVar(&endDate, "ed", "", "end date short")
	flag.StringVar(&listCount, "listcount", "100", "list")
	flag.StringVar(&listCount, "lc", "100", "list short default 100")
	flag.StringVar(&listType, "listtype", "list", "list type default list")
	flag.StringVar(&listType, "lt", "list", "list type short")
	flag.StringVar(&sort, "sort", "release_date", "sort default release_date")
	flag.StringVar(&sort2, "sort2", "down", "sort2 default down")
	flag.StringVar(&search, "search", "search", "search default search")
	flag.StringVar(&search, "s", "search", "search short")
}

// newClient returns an HTTP client whose cookie jar already holds the
// "getchu_adalt_flag" cookie for rawURL.
func newClient(rawURL string) (*http.Client, error) {
	jar, err := cookiejar.New(nil)
	if err != nil {
		return nil, fmt.Errorf("creating cookie jar: %w", err)
	}
	u, err := url.Parse(rawURL)
	if err != nil {
		return nil, fmt.Errorf("parsing %q: %w", rawURL, err)
	}
	jar.SetCookies(u, []*http.Cookie{{
		Name:   "getchu_adalt_flag",
		Value:  "getchu.com",
		Path:   "/",
		Domain: "www.getchu.com",
	}})
	return &http.Client{Jar: jar, Timeout: requestTimeout}, nil
}

// buildForm assembles the search form from the flag values. The free-text
// fields are converted to EUC-JP first; a term that cannot be encoded is
// reported as an error instead of being sent as an empty string.
func buildForm() (url.Values, error) {
	form := url.Values{}

	textFields := []struct{ key, value string }{
		{"search_keyword", keyword},
		{"search_title", title},
		{"search_brand", brand},
		{"search_person", person},
	}
	for _, f := range textFields {
		encoded, err := utf8ToEucjp(f.value)
		if err != nil {
			return nil, fmt.Errorf("encoding %s %q as EUC-JP: %w", f.key, f.value, err)
		}
		form.Set(f.key, encoded)
	}

	form.Set("search_isbn", isbn)
	form.Set("search_jan", jan)
	form.Set("start_date", startDate)
	form.Set("end_date", endDate)
	form.Set("sort", sort)
	form.Set("sort2", sort2)
	form.Set("list_count", listCount)
	form.Set("list_type", listType)
	form.Set("age", age)
	form.Set("genre", genre)
	form.Set("search", search)
	return form, nil
}

// isContentBlock reports whether n is a <div class="content_block"> element.
func isContentBlock(n *html.Node) bool {
	return n.DataAtom == atom.Div && scrape.Attr(n, "class") == "content_block"
}

// run posts the search form and prints one numbered line per matching block.
func run() error {
	client, err := newClient(getchuURL)
	if err != nil {
		return err
	}
	form, err := buildForm()
	if err != nil {
		return err
	}

	req, err := http.NewRequest("POST", getchuURL, strings.NewReader(form.Encode()))
	if err != nil {
		return fmt.Errorf("building search request: %w", err)
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")

	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("posting search request: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("posting search request: unexpected status %s", resp.Status)
	}

	// html.Parse assumes UTF-8 input, so the EUC-JP body is decoded before
	// parsing rather than decoding each extracted string afterwards. That way
	// text the parser generates itself (e.g. from character entities) is not
	// run through the EUC-JP decoder a second time.
	root, err := html.Parse(transform.NewReader(resp.Body, japanese.EUCJP.NewDecoder()))
	if err != nil {
		return fmt.Errorf("parsing search response: %w", err)
	}

	for i, block := range scrape.FindAll(root, isContentBlock) {
		fmt.Printf("%d: %s\n", i, scrape.Text(block))
	}
	return nil
}

func main() {
	registerFlags()
	flag.Parse()

	if err := run(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}