From f542f01b49b7b345d1552d64f8036234d177bd16 Mon Sep 17 00:00:00 2001 From: Oliwier Adamczyk Date: Sat, 4 Oct 2025 18:19:02 +0200 Subject: [PATCH] Added: fetching polish articles --- .gitignore | 1 + cmd/serve/main.go | 7 ++ config.json | 3 +- example.db | Bin 8192 -> 0 bytes go.mod | 23 ++++- go.sum | 115 +++++++++++++++++++++++ internal/config/setup.go | 3 +- internal/db/setup.go | 4 +- internal/wikipediadl/downloadarticles.go | 79 ++++++++++++++++ internal/wikipediadl/error.go | 7 ++ internal/wikipediadl/fetcharticles.go | 71 ++++++++++++++ 11 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 .gitignore delete mode 100644 example.db create mode 100644 internal/wikipediadl/downloadarticles.go create mode 100644 internal/wikipediadl/error.go create mode 100644 internal/wikipediadl/fetcharticles.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3997bea --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.db \ No newline at end of file diff --git a/cmd/serve/main.go b/cmd/serve/main.go index 66df405..f07a465 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -1,8 +1,10 @@ package main import ( + "log" "scrap/internal/config" "scrap/internal/db" + "scrap/internal/wikipediadl" ) func main() { @@ -10,4 +12,9 @@ func main() { db.Setup() defer db.Close() + + if err := wikipediadl.FetchArticles(); err != nil { + log.Println(err.Error()) + } + } diff --git a/config.json b/config.json index e210c76..e1c9923 100644 --- a/config.json +++ b/config.json @@ -1,3 +1,4 @@ { - "sql-tables-dir": "./sqltable/" + "sql-tables-dir": "./sqltable/", + "sql-database-name": "scrap.db" } \ No newline at end of file diff --git a/example.db b/example.db deleted file mode 100644 index dd49cbdba3b2b043cac7599ed4541908a204e90a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8192 zcmeI#u?oU45C-6+2tq-05S$7(RuBhEovkI{(8U_WwblrQ+G0%~z*X>_O^cb`<^RcX z+>z{`ew@g#OCX92oA!3YiQ;sQ{ZXVdd^lttY&X%v~$o%5tH!}}`00bZa z0SG_<0uX=z1Rwwb2teS@hcnOP&roOQQe}-U@=`r)cj8n;Qjm 0 && count >= *limiter { + log.Printf("Reached limit of %d articles, stopping.", *limiter) + return + } + } + } + } + + log.Printf("Done. Total articles processed: %d", count) +} diff --git a/internal/wikipediadl/error.go b/internal/wikipediadl/error.go new file mode 100644 index 0000000..755fe66 --- /dev/null +++ b/internal/wikipediadl/error.go @@ -0,0 +1,7 @@ +package wikipediadl + +import "errors" + +var ( + ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles") +) diff --git a/internal/wikipediadl/fetcharticles.go b/internal/wikipediadl/fetcharticles.go new file mode 100644 index 0000000..c1cac77 --- /dev/null +++ b/internal/wikipediadl/fetcharticles.go @@ -0,0 +1,71 @@ +package wikipediadl + +import ( + "fmt" + "log" + "strings" + + "github.com/gocolly/colly" +) + +const ( + DumpDomain = "dumps.wikimedia.org" + DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/" +) + +func getScraper() *colly.Collector { + s := colly.NewCollector( + colly.AllowedDomains(DumpDomain), + ) + + return s +} + +func getAllArticles(s *colly.Collector) []string { + articles := []string{} + + s.OnHTML("a", func(h *colly.HTMLElement) { + article := h.Attr("href") + if !isValidArticle(article) { + return + } + + articles = append(articles, h.Attr("href")) + + }) + + s.OnError(func(r *colly.Response, err error) { + log.Println(r.Request.URL) + }) + + s.Visit(DumpUrl) + + return articles +} + +func isValidArticle(a string) bool { + const ( + validPrefix = "plwiki-latest-pages-articles" + validSuffix = ".bz2" + ) + + if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) { + return false + } + + article, _ := strings.CutPrefix(a, validPrefix) + + articleIndex := article[0] + return articleIndex >= 48 && articleIndex <= 57 +} + +func FetchArticles() error { + scraper := getScraper() + + articles := getAllArticles(scraper) + for _, a := range articles { + fmt.Println(a) + } + + return nil +}