76 lines
1.2 KiB
Go
76 lines
1.2 KiB
Go
package wikipediadl
|
|
|
|
import (
	"compress/bzip2"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
)
|
|
|
|
type WikiArticle struct {
|
|
Title string `xml:"title"`
|
|
Revision Revision `xml:"revision"`
|
|
}
|
|
|
|
// Revision is the decoding target for a <revision> element. Only the
// <text> child element is captured.
type Revision struct {
	// Text is filled from the character data of the <text> child element.
	Text string `xml:"text"`
}
|
|
|
|
func ExtractArticles(bundle string) ([]WikiArticle, error) {
|
|
url := WikipediaDumpUrl + bundle
|
|
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
log.Println(err.Error())
|
|
return nil, errors.New("wikipediadl: failed load articles")
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
return nil, errors.New("wikipediadl: bad response status")
|
|
}
|
|
|
|
bz2Reader := bzip2.NewReader(resp.Body)
|
|
xmlDec := xml.NewDecoder(bz2Reader)
|
|
|
|
count := 0
|
|
|
|
articles := []WikiArticle{}
|
|
Loop:
|
|
for {
|
|
tok, err := xmlDec.Token()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
|
|
return nil, errors.New("XML token error")
|
|
}
|
|
|
|
switch se := tok.(type) {
|
|
case xml.StartElement:
|
|
if count == 2 { // XXX: remove later
|
|
break Loop
|
|
}
|
|
|
|
if se.Name.Local != "page" {
|
|
continue
|
|
}
|
|
|
|
var p WikiArticle
|
|
if err := xmlDec.DecodeElement(&p, &se); err != nil {
|
|
log.Println(err.Error())
|
|
continue
|
|
}
|
|
|
|
articles = append(articles, p)
|
|
count++ // XXX: remove later
|
|
}
|
|
}
|
|
|
|
return articles, nil
|
|
}
|