// Package wikipediadl downloads a chunk of a Wikipedia XML dump and
// streams the contained articles.
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
)

// Page is one <page> element of a MediaWiki dump.
type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision holds the wikitext body of a page's <revision> element.
type Revision struct {
	Text string `xml:"text"`
}

// DownloadArticles downloads the first chunk of the Polish Wikipedia
// pages-articles dump, decompresses it on the fly, and prints the title of
// each article as it is decoded. The -limit flag bounds how many articles
// are processed; 0 means process the whole chunk.
//
// NOTE(review): registering and parsing flags inside this function means a
// second call (or a caller that also defines -limit) will panic — confirm
// this is only invoked once from a main().
func DownloadArticles() {
	const url = "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"

	limit := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()

	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Include a short prefix of the body to aid debugging; the read
		// error is deliberately ignored (best-effort diagnostics only).
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}

	// Stream-decode: bzip2 and XML are both processed incrementally, so the
	// chunk is never fully held in memory.
	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)

	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}

		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					// Skip malformed pages rather than aborting the
					// whole download.
					log.Printf("Error decoding page: %v", err)
					continue
				}
				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				// Printf (not Println) so the trailing blank line is
				// explicit and the call is `go vet`-clean.
				fmt.Printf("--------------------\n\n")
				if *limit > 0 && count >= *limit {
					log.Printf("Reached limit of %d articles, stopping.", *limit)
					return
				}
			}
		}
	}
	log.Printf("Done. Total articles processed: %d", count)
}