Files
scrap/internal/wikipediadl/fetcharticles.go
2025-10-04 23:14:41 +02:00

65 lines
1.1 KiB
Go

package wikipediadl
import (
"log"
"strings"
"github.com/gocolly/colly"
)
// Location of the Polish-Wikipedia (plwiki) dump index.
const (
// DumpDomain restricts the scraper to the Wikimedia dumps host.
DumpDomain = "dumps.wikimedia.org"
// DumpUrl is the index page listing the latest plwiki dump files.
DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/"
)
func FetchArticleBundles() ([]string, error) {
scraper := getScraper()
articles := getAllArticles(scraper)
return articles, nil
}
// getScraper builds a colly collector restricted to the Wikimedia dump host,
// so the crawl cannot wander off to other domains.
func getScraper() *colly.Collector {
collector := colly.NewCollector(colly.AllowedDomains(DumpDomain))
return collector
}
// getAllArticles visits the dump index page with the given collector and
// collects the href of every anchor that names a valid article bundle.
// Visit blocks until the scrape completes, so the returned slice is final.
func getAllArticles(s *colly.Collector) []string {
articles := []string{}
s.OnHTML("a", func(h *colly.HTMLElement) {
article := h.Attr("href")
if !isValidArticle(article) {
return
}
// Reuse the href already extracted above instead of calling Attr again.
articles = append(articles, article)
})
s.OnError(func(r *colly.Response, err error) {
// Log the error itself, not just the URL, so failures are diagnosable.
log.Println(r.Request.URL, err)
})
// The signature cannot return an error without breaking callers, so a
// failed visit is logged and an empty (or partial) slice is returned.
if err := s.Visit(DumpUrl); err != nil {
log.Println("visiting dump index:", err)
}
return articles
}
// isValidArticle reports whether href a names a numbered article-bundle dump
// file: it must start with the standard plwiki articles prefix, end in .bz2,
// and have a digit immediately after the prefix (the part number). The digit
// requirement excludes non-bundle files such as the combined
// "plwiki-latest-pages-articles.xml.bz2".
func isValidArticle(a string) bool {
const (
validPrefix = "plwiki-latest-pages-articles"
validSuffix = ".bz2"
)
if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) {
return false
}
rest := strings.TrimPrefix(a, validPrefix)
// Guard the index explicitly and use byte literals '0'..'9' instead of
// the raw ASCII codes 48 and 57.
return len(rest) > 0 && rest[0] >= '0' && rest[0] <= '9'
}