Added: fetching Polish articles

Oliwier Adamczyk
2025-10-04 18:19:02 +02:00
parent fdd1c98e75
commit f542f01b49
11 changed files with 309 additions and 4 deletions

View File

@@ -0,0 +1,79 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
)

// Page is a single <page> element from the Wikipedia XML dump.
type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision holds the wikitext of a page's latest revision.
type Revision struct {
	Text string `xml:"text"`
}

// DownloadArticles streams one bz2-compressed chunk of the plwiki dump over
// HTTP and prints the title of each decoded <page>, up to -limit articles.
func DownloadArticles() {
	url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"
	// The -limit flag is registered here, so this function should only be
	// called once per process.
	limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()

	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}

	// Decompress and decode the stream incrementally, so the dump never has
	// to fit in memory.
	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)

	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}
		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					log.Printf("Error decoding page: %v", err)
					continue
				}
				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				fmt.Printf("--------------------\n\n")
				if *limiter > 0 && count >= *limiter {
					log.Printf("Reached limit of %d articles, stopping.", *limiter)
					return
				}
			}
		}
	}
	log.Printf("Done. Total articles processed: %d", count)
}
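
Since DownloadArticles registers and parses the -limit flag itself, it is meant to be called once from a program entrypoint. A minimal sketch of such a caller, assuming a separate main package that is not part of this diff (the import path below is a placeholder, not the real module path):

package main

import (
	"example.com/wikidl/wikipediadl" // hypothetical import path; the module path is not shown in this diff
)

func main() {
	// DownloadArticles handles the -limit flag itself, e.g.:
	//   go run . -limit 25
	wikipediadl.DownloadArticles()
}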

View File

@@ -0,0 +1,7 @@
package wikipediadl

import "errors"

var (
	// ErrArticleFetchFailed reports that the list of dump chunks could not
	// be fetched from dumps.wikimedia.org.
	ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles")
)
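
ErrArticleFetchFailed is not referenced anywhere else in this commit; presumably it will be returned by the fetching code later. A purely illustrative sketch (not part of the commit) of wrapping it so callers can still match it with errors.Is:

package wikipediadl

import (
	"errors"
	"fmt"
)

// wrapFetchError and isFetchError are illustrative only, not part of this commit.
func wrapFetchError(cause error) error {
	// Wrap the sentinel so the underlying cause stays in the message.
	return fmt.Errorf("%w: %v", ErrArticleFetchFailed, cause)
}

func isFetchError(err error) bool {
	return errors.Is(err, ErrArticleFetchFailed)
}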

View File

@@ -0,0 +1,71 @@
package wikipediadl

import (
	"fmt"
	"log"
	"strings"

	"github.com/gocolly/colly"
)

const (
	DumpDomain = "dumps.wikimedia.org"
	DumpUrl    = "https://dumps.wikimedia.org/plwiki/latest/"
)

// getScraper returns a colly collector restricted to the dump domain.
func getScraper() *colly.Collector {
	s := colly.NewCollector(
		colly.AllowedDomains(DumpDomain),
	)
	return s
}

// getAllArticles scrapes the dump index page and collects the file names of
// all numbered pages-articles chunks.
func getAllArticles(s *colly.Collector) []string {
	articles := []string{}
	s.OnHTML("a", func(h *colly.HTMLElement) {
		article := h.Attr("href")
		if !isValidArticle(article) {
			return
		}
		articles = append(articles, article)
	})
	s.OnError(func(r *colly.Response, err error) {
		log.Printf("request to %s failed: %v", r.Request.URL, err)
	})
	if err := s.Visit(DumpUrl); err != nil {
		log.Printf("visiting %s failed: %v", DumpUrl, err)
	}
	return articles
}

// isValidArticle reports whether a link names a numbered pages-articles
// chunk such as plwiki-latest-pages-articles1.xml-p1p187037.bz2.
func isValidArticle(a string) bool {
	const (
		validPrefix = "plwiki-latest-pages-articles"
		validSuffix = ".bz2"
	)
	if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) {
		return false
	}
	// The character right after the prefix must be a digit; this rejects
	// files whose names continue with "." or "-" after the prefix.
	article, _ := strings.CutPrefix(a, validPrefix)
	articleIndex := article[0]
	return articleIndex >= '0' && articleIndex <= '9'
}

// FetchArticles scrapes the dump index and prints every matching chunk name.
// It currently always returns nil.
func FetchArticles() error {
	scraper := getScraper()
	articles := getAllArticles(scraper)
	for _, a := range articles {
		fmt.Println(a)
	}
	return nil
}
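
The hrefs scraped here are file names relative to the dump index page, so a later step would need to join them onto DumpUrl before they can be downloaded. A small sketch of that wiring, with buildChunkURLs as a hypothetical helper that does not exist in this commit:

package wikipediadl

// buildChunkURLs is a hypothetical helper (not part of this commit) showing
// how the scraped file names could be turned into absolute download URLs.
func buildChunkURLs(articles []string) []string {
	urls := make([]string, 0, len(articles))
	for _, a := range articles {
		urls = append(urls, DumpUrl+a)
	}
	return urls
}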