Added: downloading and returning Wikipedia articles

This commit is contained in:
Oliwier Adamczyk
2025-10-04 23:14:41 +02:00
parent f542f01b49
commit 6df63dc4c1
26 changed files with 636 additions and 100 deletions

View File

@@ -0,0 +1,75 @@
package wikipediadl
import (
	"compress/bzip2"
	"encoding/xml"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
)
// WikiArticle is one article decoded from a <page> element of the XML dump
// read by ExtractArticles.
type WikiArticle struct {
	// Title is the character data of the page's <title> child element.
	Title string `xml:"title"`

	// Revision is decoded from the page's <revision> child element.
	Revision Revision `xml:"revision"`
}
// Revision maps a <revision> element nested inside a <page>.
type Revision struct {
	// Text is the character data of the revision's <text> child element
	// (presumably the article body as wiki markup; confirm against the dump schema).
	Text string `xml:"text"`
}
// ExtractArticles downloads the bzip2-compressed XML dump located at
// WikipediaDumpUrl + bundle and decodes its <page> elements into WikiArticle
// values.
//
// The response body is decompressed and tokenized as a stream. Elements other
// than <page> are ignored; a <page> that fails to decode is logged and skipped.
// Decoding currently stops once maxArticles pages have been collected, which
// is a temporary development limit.
//
// On success the returned slice is non-nil, though it may be empty. On failure
// the slice is nil and the error wraps the underlying cause, so callers can
// inspect it with errors.Is / errors.As.
func ExtractArticles(bundle string) ([]WikiArticle, error) {
	// XXX: temporary cap on the number of decoded pages; remove later.
	const maxArticles = 2

	dumpURL := WikipediaDumpUrl + bundle

	// NOTE(review): http.Get uses http.DefaultClient, which has no timeout;
	// consider a dedicated client or a context-aware request.
	resp, err := http.Get(dumpURL)
	if err != nil {
		return nil, fmt.Errorf("wikipediadl: failed load articles: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wikipediadl: bad response status: %s", resp.Status)
	}

	xmlDec := xml.NewDecoder(bzip2.NewReader(resp.Body))
	articles := make([]WikiArticle, 0, maxArticles)

	for len(articles) < maxArticles {
		tok, err := xmlDec.Token()
		if errors.Is(err, io.EOF) {
			// Clean end of the XML stream.
			break
		}
		if err != nil {
			return nil, fmt.Errorf("wikipediadl: XML token error: %w", err)
		}

		se, ok := tok.(xml.StartElement)
		if !ok || se.Name.Local != "page" {
			continue
		}

		var p WikiArticle
		if err := xmlDec.DecodeElement(&p, &se); err != nil {
			// Best effort: one undecodable page must not discard the rest.
			log.Printf("wikipediadl: skipping page: %v", err)
			continue
		}
		articles = append(articles, p)
	}

	return articles, nil
}