80 lines
1.7 KiB
Go
80 lines
1.7 KiB
Go
package wikipediadl
|
|
|
|
import (
|
|
"compress/bzip2"
|
|
"encoding/xml"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
)
|
|
|
|
type Page struct {
|
|
Title string `xml:"title"`
|
|
Revision Revision `xml:"revision"`
|
|
}
|
|
|
|
// Revision is the subset of a MediaWiki export <revision> element that this
// package decodes.
type Revision struct {
	// Text is taken from the <text> child element.
	Text string `xml:"text"`
}
|
|
|
|
func DownloadArticles() {
|
|
url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"
|
|
|
|
limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
|
|
flag.Parse()
|
|
|
|
log.Printf("Downloading chunk: %s", url)
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
log.Fatalf("Failed to download chunk: %v", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != 200 {
|
|
body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
|
|
log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
|
|
}
|
|
|
|
bz2Reader := bzip2.NewReader(resp.Body)
|
|
dec := xml.NewDecoder(bz2Reader)
|
|
|
|
count := 0
|
|
for {
|
|
tok, err := dec.Token()
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
log.Println("Reached end of chunk")
|
|
break
|
|
}
|
|
log.Fatalf("XML token error: %v", err)
|
|
}
|
|
|
|
switch se := tok.(type) {
|
|
case xml.StartElement:
|
|
if se.Name.Local == "page" {
|
|
var p Page
|
|
if err := dec.DecodeElement(&p, &se); err != nil {
|
|
log.Printf("Error decoding page: %v", err)
|
|
continue
|
|
}
|
|
|
|
count++
|
|
fmt.Printf("---- Article %d ----\n", count)
|
|
fmt.Printf("Title: %s\n", p.Title)
|
|
// fmt.Println("Content:")
|
|
// fmt.Println(p.Revision.Text)
|
|
fmt.Println("--------------------\n")
|
|
|
|
if *limiter > 0 && count >= *limiter {
|
|
log.Printf("Reached limit of %d articles, stopping.", *limiter)
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
log.Printf("Done. Total articles processed: %d", count)
|
|
}
|