Added: fetching Polish articles
This commit is contained in:
79
internal/wikipediadl/downloadarticles.go
Normal file
@@ -0,0 +1,79 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
)

// Page mirrors the subset of the MediaWiki dump schema we care about.
type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision holds the wikitext body of a page.
type Revision struct {
	Text string `xml:"text"`
}

// DownloadArticles streams one bzip2-compressed chunk of the Polish Wikipedia
// dump over HTTP and prints the title of each <page> element, stopping after
// the number of articles given by the -limit flag (0 means all).
func DownloadArticles() {
	url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"

	limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()

	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}

	// Decompress and decode the response incrementally; the chunk is never
	// held in memory as a whole.
	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)

	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}

		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				// Decode one <page> element at a time.
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					log.Printf("Error decoding page: %v", err)
					continue
				}

				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				fmt.Printf("--------------------\n\n")

				if *limiter > 0 && count >= *limiter {
					log.Printf("Reached limit of %d articles, stopping.", *limiter)
					return
				}
			}
		}
	}

	log.Printf("Done. Total articles processed: %d", count)
}
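For context, a minimal sketch of how DownloadArticles might be wired into a command-line entry point. The cmd/download layout and the example.com/yourmodule module path are placeholders, not part of this commit:

	// Hypothetical cmd/download/main.go; the import path is a placeholder for
	// whatever the module is actually named in go.mod.
	package main

	import "example.com/yourmodule/internal/wikipediadl"

	func main() {
		// DownloadArticles registers and parses the -limit flag itself, so it
		// can be run as, for example: go run ./cmd/download -limit 5
		wikipediadl.DownloadArticles()
	}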
7
internal/wikipediadl/error.go
Normal file
@@ -0,0 +1,7 @@
package wikipediadl

import "errors"

var (
	ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles")
)
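ErrArticleFetchFailed is a sentinel error, so the natural caller-side pattern is errors.Is. A minimal sketch under the assumption that FetchArticles will eventually wrap this sentinel with %w (in this commit it still returns nil); the module path is a placeholder:

	package main

	import (
		"errors"
		"log"

		"example.com/yourmodule/internal/wikipediadl" // placeholder module path
	)

	func main() {
		err := wikipediadl.FetchArticles()
		// errors.Is matches the sentinel even when it has been wrapped with
		// fmt.Errorf("...: %w", ErrArticleFetchFailed).
		if errors.Is(err, wikipediadl.ErrArticleFetchFailed) {
			log.Fatal("could not fetch the article list")
		}
	}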
71
internal/wikipediadl/fetcharticles.go
Normal file
@@ -0,0 +1,71 @@
package wikipediadl

import (
	"fmt"
	"log"
	"strings"

	"github.com/gocolly/colly"
)

const (
	DumpDomain = "dumps.wikimedia.org"
	DumpUrl    = "https://dumps.wikimedia.org/plwiki/latest/"
)

// getScraper returns a collector restricted to the Wikimedia dumps domain.
func getScraper() *colly.Collector {
	s := colly.NewCollector(
		colly.AllowedDomains(DumpDomain),
	)

	return s
}

// getAllArticles visits the plwiki "latest" index page and collects every
// link that looks like a numbered, bzip2-compressed article chunk.
func getAllArticles(s *colly.Collector) []string {
	articles := []string{}

	s.OnHTML("a", func(h *colly.HTMLElement) {
		article := h.Attr("href")
		if !isValidArticle(article) {
			return
		}

		articles = append(articles, article)
	})

	s.OnError(func(r *colly.Response, err error) {
		log.Printf("request to %s failed: %v", r.Request.URL, err)
	})

	if err := s.Visit(DumpUrl); err != nil {
		log.Printf("failed to visit %s: %v", DumpUrl, err)
	}

	return articles
}

// isValidArticle reports whether a link name is an article chunk: it must
// have the expected prefix and .bz2 suffix, and the character right after
// the prefix must be a digit (which excludes the combined, un-numbered dump).
func isValidArticle(a string) bool {
	const (
		validPrefix = "plwiki-latest-pages-articles"
		validSuffix = ".bz2"
	)

	if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) {
		return false
	}

	article, _ := strings.CutPrefix(a, validPrefix)

	articleIndex := article[0]
	return articleIndex >= '0' && articleIndex <= '9'
}

// FetchArticles lists the available article chunks and prints their names.
func FetchArticles() error {
	scraper := getScraper()

	articles := getAllArticles(scraper)
	for _, a := range articles {
		fmt.Println(a)
	}

	return nil
}
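A minimal sketch of calling FetchArticles from an entry point; the module path is again a placeholder, and the skipped file name in the comment is only an illustration of the digit check in isValidArticle:

	package main

	import (
		"log"

		"example.com/yourmodule/internal/wikipediadl" // placeholder module path
	)

	func main() {
		// Prints every chunk link found on the plwiki "latest" index page, e.g.
		// plwiki-latest-pages-articles1.xml-p1p187037.bz2. A name such as
		// plwiki-latest-pages-articles.xml.bz2 would be skipped, because the
		// character right after the prefix is not a digit.
		if err := wikipediadl.FetchArticles(); err != nil {
			log.Fatal(err)
		}
	}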