// Package wikipediadl fetches bzip2-compressed XML bundles over HTTP and
// decodes the <page> elements they contain.
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
)

// WikiArticle holds the data decoded from a single <page> element: the
// contents of its <title> child and its <revision> child.
type WikiArticle struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision holds the contents of the <text> child of a <revision> element.
type Revision struct {
	Text string `xml:"text"`
}

// ExtractArticles downloads the bundle found at WikipediaDumpUrl + bundle,
// decompresses it as bzip2 and decodes every <page> element of the resulting
// XML stream into a WikiArticle.
//
// It returns a non-nil error when the HTTP request fails, when the server
// answers with a status other than 200 OK, or when the XML token stream cannot
// be read; request and token errors wrap the underlying cause. A <page>
// element that fails to decode is logged and skipped rather than aborting the
// whole extraction.
//
// NOTE: extraction currently stops after debugArticleLimit articles; this cap
// is temporary (see the XXX marker below).
func ExtractArticles(bundle string) ([]WikiArticle, error) {
	// XXX: remove later. Temporary cap on the number of decoded articles.
	const debugArticleLimit = 2

	url := WikipediaDumpUrl + bundle

	resp, err := http.Get(url)
	if err != nil {
		return nil, fmt.Errorf("wikipediadl: failed load articles: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wikipediadl: bad response status: %s", resp.Status)
	}

	// The body is streamed: bzip2 decompression and XML tokenizing happen
	// incrementally, so the bundle is never held in memory as a whole.
	xmlDec := xml.NewDecoder(bzip2.NewReader(resp.Body))

	// Kept non-nil so an empty result stays an empty slice, as before.
	articles := make([]WikiArticle, 0, debugArticleLimit)

	for len(articles) < debugArticleLimit {
		tok, err := xmlDec.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return nil, fmt.Errorf("wikipediadl: XML token error: %w", err)
		}

		se, ok := tok.(xml.StartElement)
		if !ok || se.Name.Local != "page" {
			continue
		}

		var p WikiArticle
		if err := xmlDec.DecodeElement(&p, &se); err != nil {
			// Best effort: a single malformed page must not discard the
			// articles that decoded fine.
			log.Println(err.Error())
			continue
		}
		articles = append(articles, p)
	}

	return articles, nil
}