// Package wikipediadl discovers the article dump bundles that are linked from
// the plwiki "latest" dump index page.
package wikipediadl

import (
	"fmt"
	"log"
	"strings"

	"github.com/gocolly/colly"
)

const (
	// DumpDomain is the only host the scraper is allowed to visit.
	DumpDomain = "dumps.wikimedia.org"
	// DumpUrl is the index page that is scanned for article bundle links.
	DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/"
)

// FetchArticleBundles visits DumpUrl and returns the href value of every link
// on that page that isValidArticle accepts.
//
// A non-nil error is returned when the page could not be visited; in that
// case the returned slice is nil.
func FetchArticleBundles() ([]string, error) {
	articles, err := collectArticles(getScraper())
	if err != nil {
		return nil, fmt.Errorf("fetching article bundles from %s: %w", DumpUrl, err)
	}
	return articles, nil
}

// getScraper builds a collector that is restricted to DumpDomain.
func getScraper() *colly.Collector {
	return colly.NewCollector(
		colly.AllowedDomains(DumpDomain),
	)
}

// getAllArticles visits DumpUrl with s and returns the matching hrefs.
//
// It is the log-only variant of collectArticles: a failed visit is logged and
// whatever was collected up to that point is returned.
func getAllArticles(s *colly.Collector) []string {
	articles, err := collectArticles(s)
	if err != nil {
		log.Printf("visiting %s: %v", DumpUrl, err)
	}
	return articles
}

// collectArticles registers an anchor handler on s, visits DumpUrl, and
// returns the collected hrefs together with the error reported by Visit.
func collectArticles(s *colly.Collector) ([]string, error) {
	// Start from a non-nil empty slice so "no matches" stays distinguishable
	// from nil for callers that care (e.g. when encoding the result).
	articles := []string{}

	s.OnHTML("a", func(h *colly.HTMLElement) {
		if href := h.Attr("href"); isValidArticle(href) {
			articles = append(articles, href)
		}
	})

	// Visit's error was previously dropped, which made a failed download
	// indistinguishable from an index page without any bundles.
	err := s.Visit(DumpUrl)
	return articles, err
}

// isValidArticle reports whether a names a numbered article bundle: it must
// start with "plwiki-latest-pages-articles", end with ".bz2", and have a
// decimal digit immediately after the prefix.
func isValidArticle(a string) bool {
	const (
		validPrefix = "plwiki-latest-pages-articles"
		validSuffix = ".bz2"
	)

	if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) {
		return false
	}

	// The prefix contains no '.', so it cannot overlap the suffix; the
	// remainder therefore holds at least the suffix and indexing is safe.
	rest, _ := strings.CutPrefix(a, validPrefix)
	first := rest[0]
	return first >= '0' && first <= '9'
}