72 lines
1.2 KiB
Go
72 lines
1.2 KiB
Go
package wikipediadl
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"strings"
|
|
|
|
"github.com/gocolly/colly"
|
|
)
|
|
|
|
// DumpDomain is the host name that serves the Wikimedia dump files.
const DumpDomain = "dumps.wikimedia.org"

// DumpUrl is the HTTPS address of the latest plwiki dump directory listing
// on DumpDomain.
const DumpUrl = "https://" + DumpDomain + "/plwiki/latest/"
|
|
|
|
func getScraper() *colly.Collector {
|
|
s := colly.NewCollector(
|
|
colly.AllowedDomains(DumpDomain),
|
|
)
|
|
|
|
return s
|
|
}
|
|
|
|
func getAllArticles(s *colly.Collector) []string {
|
|
articles := []string{}
|
|
|
|
s.OnHTML("a", func(h *colly.HTMLElement) {
|
|
article := h.Attr("href")
|
|
if !isValidArticle(article) {
|
|
return
|
|
}
|
|
|
|
articles = append(articles, h.Attr("href"))
|
|
|
|
})
|
|
|
|
s.OnError(func(r *colly.Response, err error) {
|
|
log.Println(r.Request.URL)
|
|
})
|
|
|
|
s.Visit(DumpUrl)
|
|
|
|
return articles
|
|
}
|
|
|
|
// isValidArticle reports whether a looks like a numbered article dump file:
// it must start with "plwiki-latest-pages-articles", end with ".bz2", and
// the byte immediately after the prefix must be an ASCII digit.
func isValidArticle(a string) bool {
	const (
		wantPrefix = "plwiki-latest-pages-articles"
		wantSuffix = ".bz2"
	)

	rest, hasPrefix := strings.CutPrefix(a, wantPrefix)
	if !(hasPrefix && strings.HasSuffix(a, wantSuffix)) {
		return false
	}

	// The prefix contains no '.', so the suffix cannot overlap it; rest
	// therefore holds at least the suffix and indexing byte 0 is safe.
	first := rest[0]
	return '0' <= first && first <= '9'
}
|
|
|
|
func FetchArticles() error {
|
|
scraper := getScraper()
|
|
|
|
articles := getAllArticles(scraper)
|
|
for _, a := range articles {
|
|
fmt.Println(a)
|
|
}
|
|
|
|
return nil
|
|
}
|