From fdd1c98e75e55d77d0b2c8e1db2e7862d2f7af4d Mon Sep 17 00:00:00 2001 From: Oliwier Adamczyk Date: Sat, 4 Oct 2025 15:11:20 +0200 Subject: [PATCH 1/5] Added: creating database, sql tx repository --- cmd/serve/main.go | 10 +++++- config.json | 3 ++ example.db | Bin 0 -> 8192 bytes go.mod | 2 ++ go.sum | 2 ++ internal/config/setup.go | 25 ++++++++++++++ internal/db/irepository.go | 14 ++++++++ internal/db/repository.go | 34 +++++++++++++++++++ internal/db/setup.go | 65 +++++++++++++++++++++++++++++++++++++ sqltable/1_articles.sql | 5 +++ 10 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 config.json create mode 100644 example.db create mode 100644 go.sum create mode 100644 internal/config/setup.go create mode 100644 internal/db/irepository.go create mode 100644 internal/db/repository.go create mode 100644 internal/db/setup.go create mode 100644 sqltable/1_articles.sql diff --git a/cmd/serve/main.go b/cmd/serve/main.go index c4e96f5..66df405 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -1,5 +1,13 @@ -package serve +package main + +import ( + "scrap/internal/config" + "scrap/internal/db" +) func main() { + config.Setup() + db.Setup() + defer db.Close() } diff --git a/config.json b/config.json new file mode 100644 index 0000000..e210c76 --- /dev/null +++ b/config.json @@ -0,0 +1,3 @@ +{ + "sql-tables-dir": "./sqltable/" +} \ No newline at end of file diff --git a/example.db b/example.db new file mode 100644 index 0000000000000000000000000000000000000000..dd49cbdba3b2b043cac7599ed4541908a204e90a GIT binary patch literal 8192 zcmeI#u?oU45C-6+2tq-05S$7(RuBhEovkI{(8U_WwblrQ+G0%~z*X>_O^cb`<^RcX z+>z{`ew@g#OCX92oA!3YiQ;sQ{ZXVdd^lttY&X%v~$o%5tH!}}`00bZa z0SG_<0uX=z1Rwwb2teS@hcnOP&roOQQe}-U@=`r)cj8n;Qjm Date: Sat, 4 Oct 2025 18:19:02 +0200 Subject: [PATCH 2/5] Added: fetching polish articles --- .gitignore | 1 + cmd/serve/main.go | 7 ++ config.json | 3 +- example.db | Bin 8192 -> 0 bytes go.mod | 23 ++++- go.sum | 115 +++++++++++++++++++++++ internal/config/setup.go | 3 +- internal/db/setup.go | 4 +- internal/wikipediadl/downloadarticles.go | 79 ++++++++++++++++ internal/wikipediadl/error.go | 7 ++ internal/wikipediadl/fetcharticles.go | 71 ++++++++++++++ 11 files changed, 309 insertions(+), 4 deletions(-) create mode 100644 .gitignore delete mode 100644 example.db create mode 100644 internal/wikipediadl/downloadarticles.go create mode 100644 internal/wikipediadl/error.go create mode 100644 internal/wikipediadl/fetcharticles.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3997bea --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.db \ No newline at end of file diff --git a/cmd/serve/main.go b/cmd/serve/main.go index 66df405..f07a465 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -1,8 +1,10 @@ package main import ( + "log" "scrap/internal/config" "scrap/internal/db" + "scrap/internal/wikipediadl" ) func main() { @@ -10,4 +12,9 @@ func main() { db.Setup() defer db.Close() + + if err := wikipediadl.FetchArticles(); err != nil { + log.Println(err.Error()) + } + } diff --git a/config.json b/config.json index e210c76..e1c9923 100644 --- a/config.json +++ b/config.json @@ -1,3 +1,4 @@ { - "sql-tables-dir": "./sqltable/" + "sql-tables-dir": "./sqltable/", + "sql-database-name": "scrap.db" } \ No newline at end of file diff --git a/example.db b/example.db deleted file mode 100644 index dd49cbdba3b2b043cac7599ed4541908a204e90a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8192 zcmeI#u?oU45C-6+2tq-05S$7(RuBhEovkI{(8U_WwblrQ+G0%~z*X>_O^cb`<^RcX z+>z{`ew@g#OCX92oA!3YiQ;sQ{ZXVdd^lttY&X%v~$o%5tH!}}`00bZa z0SG_<0uX=z1Rwwb2teS@hcnOP&roOQQe}-U@=`r)cj8n;Qjm 0 && count >= *limiter { + log.Printf("Reached limit of %d articles, stopping.", *limiter) + return + } + } + } + } + + log.Printf("Done. Total articles processed: %d", count) +} diff --git a/internal/wikipediadl/error.go b/internal/wikipediadl/error.go new file mode 100644 index 0000000..755fe66 --- /dev/null +++ b/internal/wikipediadl/error.go @@ -0,0 +1,7 @@ +package wikipediadl + +import "errors" + +var ( + ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles") +) diff --git a/internal/wikipediadl/fetcharticles.go b/internal/wikipediadl/fetcharticles.go new file mode 100644 index 0000000..c1cac77 --- /dev/null +++ b/internal/wikipediadl/fetcharticles.go @@ -0,0 +1,71 @@ +package wikipediadl + +import ( + "fmt" + "log" + "strings" + + "github.com/gocolly/colly" +) + +const ( + DumpDomain = "dumps.wikimedia.org" + DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/" +) + +func getScraper() *colly.Collector { + s := colly.NewCollector( + colly.AllowedDomains(DumpDomain), + ) + + return s +} + +func getAllArticles(s *colly.Collector) []string { + articles := []string{} + + s.OnHTML("a", func(h *colly.HTMLElement) { + article := h.Attr("href") + if !isValidArticle(article) { + return + } + + articles = append(articles, h.Attr("href")) + + }) + + s.OnError(func(r *colly.Response, err error) { + log.Println(r.Request.URL) + }) + + s.Visit(DumpUrl) + + return articles +} + +func isValidArticle(a string) bool { + const ( + validPrefix = "plwiki-latest-pages-articles" + validSuffix = ".bz2" + ) + + if !strings.HasPrefix(a, validPrefix) || !strings.HasSuffix(a, validSuffix) { + return false + } + + article, _ := strings.CutPrefix(a, validPrefix) + + articleIndex := article[0] + return articleIndex >= 48 && articleIndex <= 57 +} + +func FetchArticles() error { + scraper := getScraper() + + articles := getAllArticles(scraper) + for _, a := range articles { + fmt.Println(a) + } + + return nil +} -- 2.49.1 From 6df63dc4c1b08f9973ed503a971b0cd227c32377 Mon Sep 17 00:00:00 2001 From: Oliwier Adamczyk Date: Sat, 4 Oct 2025 23:14:41 +0200 Subject: [PATCH 3/5] Added: downloading and returning wikipedia articles --- api/article/handler.go | 78 +++++++++++++++++ api/article/httperror.go | 14 +++ api/article/request.go | 16 ++++ api/article/response.go | 15 ++++ api/httpio/httperror.go | 24 +++++ api/httpio/request.go | 38 ++++++++ api/httpio/response.go | 21 +++++ api/httpio/urlquery.go | 86 ++++++++++++++++++ api/setup.go | 17 ++++ cmd/serve/main.go | 7 +- go.mod | 2 + go.sum | 4 + internal/article/dto.go | 11 +++ internal/article/error.go | 12 +++ internal/article/irepository.go | 9 ++ internal/article/iservice.go | 6 ++ internal/article/model.go | 13 +++ internal/article/repository.go | 60 +++++++++++++ internal/article/service.go | 107 +++++++++++++++++++++++ internal/db/error.go | 8 ++ internal/wikipediadl/const.go | 6 ++ internal/wikipediadl/downloadarticles.go | 79 ----------------- internal/wikipediadl/error.go | 3 +- internal/wikipediadl/extractarticles.go | 75 ++++++++++++++++ internal/wikipediadl/fetcharticles.go | 23 ++--- sqltable/1_articles.sql | 2 +- 26 files changed, 636 insertions(+), 100 deletions(-) create mode 100644 api/article/handler.go create mode 100644 api/article/httperror.go create mode 100644 api/article/request.go create mode 100644 api/article/response.go create mode 100644 api/httpio/httperror.go create mode 100644 api/httpio/request.go create mode 100644 api/httpio/response.go create mode 100644 api/httpio/urlquery.go create mode 100644 api/setup.go create mode 100644 internal/article/dto.go create mode 100644 internal/article/error.go create mode 100644 internal/article/irepository.go create mode 100644 internal/article/iservice.go create mode 100644 internal/article/model.go create mode 100644 internal/article/repository.go create mode 100644 internal/article/service.go create mode 100644 internal/db/error.go create mode 100644 internal/wikipediadl/const.go delete mode 100644 internal/wikipediadl/downloadarticles.go create mode 100644 internal/wikipediadl/extractarticles.go diff --git a/api/article/handler.go b/api/article/handler.go new file mode 100644 index 0000000..b7416ec --- /dev/null +++ b/api/article/handler.go @@ -0,0 +1,78 @@ +package article + +import ( + "net/http" + "scrap/api/httpio" + "scrap/internal/article" + "scrap/internal/db" +) + +func ArticleDownloadHandler(w http.ResponseWriter, r *http.Request) { + dbInstance := db.GetInstance() + txRepo := db.NewTxRepository(dbInstance) + articleRepo := article.NewArticleRepository() + + service := article.NewArticleService(txRepo, articleRepo) + if err := service.DownloadArticles(); err != nil { + switch err { + default: + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + } + + return + } +} + +func ArticleQueryHandler(w http.ResponseWriter, r *http.Request) { + body, err := httpio.ParseURLQuery[ArticleQueryRequest]( + r, + httpio.URLQueryKey[string]("title"), + ) + if err != nil { + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + return + } + + if httpErr := body.Validate(); httpErr != nil { + httpErr.Raise(w) + return + } + + dbInstance := db.GetInstance() + txRepo := db.NewTxRepository(dbInstance) + articleRepo := article.NewArticleRepository() + + service := article.NewArticleService(txRepo, articleRepo) + + articleQueryData := article.ArticleQueryDTO{ + Title: body.Title, + } + + articles, err := service.QueryArticles(articleQueryData) + if err != nil { + switch err { + case article.ErrArticleTitleInvalidLength: + ErrHttpArticleTitleInvalidLength.Raise(w) + default: + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + } + + return + } + + articlesOut := make([]ArticleResponse, 0, len(articles)) + for _, a := range articles { + ar := ArticleResponse{ + Uuid: a.Uuid, + Title: a.Title, + Content: a.Content, + } + + articlesOut = append(articlesOut, ar) + } + + if err = ArticleQueryResponse(articlesOut).Return(w, http.StatusOK); err != nil { + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + return + } +} diff --git a/api/article/httperror.go b/api/article/httperror.go new file mode 100644 index 0000000..bf88bb6 --- /dev/null +++ b/api/article/httperror.go @@ -0,0 +1,14 @@ +package article + +import ( + "net/http" + "scrap/api/httpio" +) + +var ( + ErrHttpArticleTitleInvalidLength = httpio.HTTPError{ + StatusCode: http.StatusBadRequest, + ErrorCode: "ARTICLE_TITLE_LENGTH", + Message: "Invalid title length.", + } +) diff --git a/api/article/request.go b/api/article/request.go new file mode 100644 index 0000000..7ee6a20 --- /dev/null +++ b/api/article/request.go @@ -0,0 +1,16 @@ +package article + +import "scrap/api/httpio" + +type ArticleQueryRequest struct { + Title string `json:"title"` +} + +func (a ArticleQueryRequest) Validate() *httpio.HTTPError { + titleLength := len(a.Title) + if titleLength < 1 || titleLength > 255 { + return &ErrHttpArticleTitleInvalidLength + } + + return nil +} diff --git a/api/article/response.go b/api/article/response.go new file mode 100644 index 0000000..b8b2401 --- /dev/null +++ b/api/article/response.go @@ -0,0 +1,15 @@ +package article + +import "scrap/api/httpio" + +type ArticleResponse struct { + Uuid string `json:"uuid"` + Title string `json:"title"` + Content string `json:"content"` +} + +func ArticleQueryResponse(articles []ArticleResponse) httpio.ResponseIO { + return httpio.ResponseIO{ + "articles": articles, + } +} diff --git a/api/httpio/httperror.go b/api/httpio/httperror.go new file mode 100644 index 0000000..59de5f8 --- /dev/null +++ b/api/httpio/httperror.go @@ -0,0 +1,24 @@ +package httpio + +import ( + "encoding/json" + "net/http" +) + +type HTTPError struct { + StatusCode int `json:"-"` + ErrorCode string `json:"error-code"` + Message string `json:"message"` +} + +func (h HTTPError) Raise(w http.ResponseWriter) { + jsonBytes, _ := json.Marshal(h) + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(h.StatusCode) + w.Write(jsonBytes) +} + +func RaiseOnlyStatusCode(w http.ResponseWriter, code int) { + http.Error(w, "", code) +} diff --git a/api/httpio/request.go b/api/httpio/request.go new file mode 100644 index 0000000..fb024bd --- /dev/null +++ b/api/httpio/request.go @@ -0,0 +1,38 @@ +package httpio + +import ( + "encoding/json" + "errors" + "io" + "log" + "net/http" +) + +type IRequestIO interface { + // Validates the received request. + Validate() *HTTPError +} + +// Parses request body into the provided struct. +// Throws an error if the body could not be parsed. +func ParseRequestBody[T IRequestIO](r *http.Request) (*T, error) { + requestBytes, err := io.ReadAll(r.Body) + if err != nil { + log.Println(err.Error()) + return nil, err + } + + if !json.Valid(requestBytes) { + return nil, errors.New("invalid JSON format") + } + + var req T + err = json.Unmarshal(requestBytes, &req) + if err != nil { + log.Println(err.Error()) + return nil, err + } + + return &req, nil + +} diff --git a/api/httpio/response.go b/api/httpio/response.go new file mode 100644 index 0000000..897f7df --- /dev/null +++ b/api/httpio/response.go @@ -0,0 +1,21 @@ +package httpio + +import ( + "encoding/json" + "net/http" +) + +type ResponseIO map[string]any + +func (r ResponseIO) Return(w http.ResponseWriter, statusCode int) error { + jsonBytes, err := json.Marshal(r) + if err != nil { + return err + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(jsonBytes) + + return nil +} diff --git a/api/httpio/urlquery.go b/api/httpio/urlquery.go new file mode 100644 index 0000000..70fd69d --- /dev/null +++ b/api/httpio/urlquery.go @@ -0,0 +1,86 @@ +package httpio + +import ( + "encoding/json" + "errors" + "net/http" + "strconv" +) + +type URLQueryValueType interface { + string | int | float32 | float64 | bool +} + +type iURLQueryKeyType interface { + GetKey() string +} + +type URLQueryKeyType[T URLQueryValueType] struct { + Key string + _ T +} + +func (u URLQueryKeyType[T]) GetKey() string { return u.Key } + +func URLQueryKey[T URLQueryValueType](key string) iURLQueryKeyType { + return URLQueryKeyType[T]{ + Key: key, + } +} + +func ParseURLQuery[T IRequestIO](r *http.Request, keys ...iURLQueryKeyType) (*T, error) { + query := make(map[string]any, len(keys)) + + for _, key := range keys { + queryValue := r.URL.Query().Get(key.GetKey()) + + if queryValue == "" { + continue + } + + switch key.(type) { + case URLQueryKeyType[string]: + query[key.GetKey()] = queryValue + case URLQueryKeyType[int]: + x, err := strconv.Atoi(queryValue) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[float32]: + x, err := strconv.ParseFloat(queryValue, 32) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[float64]: + x, err := strconv.ParseFloat(queryValue, 64) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[bool]: + x, err := strconv.ParseBool(queryValue) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + default: + return nil, errors.New("unsupported URL query key type") + } + } + + queryBytes, _ := json.Marshal(query) + + var req T + err := json.Unmarshal(queryBytes, &req) + if err != nil { + return nil, err + } + + return &req, nil +} diff --git a/api/setup.go b/api/setup.go new file mode 100644 index 0000000..7d958b4 --- /dev/null +++ b/api/setup.go @@ -0,0 +1,17 @@ +package api + +import ( + "net/http" + "scrap/api/article" + + "github.com/go-chi/chi" +) + +func Setup() { + r := chi.NewRouter() + + r.Get("/articles", article.ArticleQueryHandler) + r.Get("/articles-download", article.ArticleDownloadHandler) + + http.ListenAndServe(":8080", r) +} diff --git a/cmd/serve/main.go b/cmd/serve/main.go index f07a465..45f945f 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -2,9 +2,9 @@ package main import ( "log" + "scrap/api" "scrap/internal/config" "scrap/internal/db" - "scrap/internal/wikipediadl" ) func main() { @@ -13,8 +13,7 @@ func main() { db.Setup() defer db.Close() - if err := wikipediadl.FetchArticles(); err != nil { - log.Println(err.Error()) - } + log.SetFlags(log.Lshortfile) + api.Setup() } diff --git a/go.mod b/go.mod index 782b439..766969b 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module scrap go 1.24.4 require ( + github.com/go-chi/chi v1.5.5 github.com/gocolly/colly v1.2.0 github.com/mattn/go-sqlite3 v1.14.32 ) @@ -16,6 +17,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/protobuf v1.5.4 // indirect + github.com/google/uuid v1.6.0 github.com/kennygrant/sanitize v1.2.4 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/temoto/robotstxt v1.1.2 // indirect diff --git a/go.sum b/go.sum index 0fe57bd..b6ab6b0 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ= github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE= +github.com/go-chi/chi v1.5.5/go.mod h1:C9JqLr3tIYjDOZpzn+BCuxY8z8vmca43EeMgyZt7irw= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= @@ -26,6 +28,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= diff --git a/internal/article/dto.go b/internal/article/dto.go new file mode 100644 index 0000000..23dc451 --- /dev/null +++ b/internal/article/dto.go @@ -0,0 +1,11 @@ +package article + +type ArticleDTO struct { + Uuid string + Title string + Content string +} + +type ArticleQueryDTO struct { + Title string +} diff --git a/internal/article/error.go b/internal/article/error.go new file mode 100644 index 0000000..456df0d --- /dev/null +++ b/internal/article/error.go @@ -0,0 +1,12 @@ +package article + +import "errors" + +var ( + ErrArticleDownloadFailed = errors.New("article: download failed") + ErrArticleQueryFailed = errors.New("article: article query failed") + ErrArticleCreateFailed = errors.New("article: create failed") + ErrArticleDeleteAllFailed = errors.New("article: failed to delete all articles") + + ErrArticleTitleInvalidLength = errors.New("article: invalid article length") +) diff --git a/internal/article/irepository.go b/internal/article/irepository.go new file mode 100644 index 0000000..9736f25 --- /dev/null +++ b/internal/article/irepository.go @@ -0,0 +1,9 @@ +package article + +import "database/sql" + +type IArticleRepository interface { + CreateArticle(tx *sql.Tx, data ArticleCreateModel) error + GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) + DeleteAllArticles(tx *sql.Tx) error +} diff --git a/internal/article/iservice.go b/internal/article/iservice.go new file mode 100644 index 0000000..543eb2a --- /dev/null +++ b/internal/article/iservice.go @@ -0,0 +1,6 @@ +package article + +type IArticleService interface { + DownloadArticles() error + QueryArticles(ArticleQueryDTO) ([]ArticleDTO, error) +} diff --git a/internal/article/model.go b/internal/article/model.go new file mode 100644 index 0000000..84b7e6e --- /dev/null +++ b/internal/article/model.go @@ -0,0 +1,13 @@ +package article + +type ArticleModel struct { + Uuid string + Title string + Content string +} + +type ArticleCreateModel struct { + Uuid string + Title string + Content string +} diff --git a/internal/article/repository.go b/internal/article/repository.go new file mode 100644 index 0000000..7ae685d --- /dev/null +++ b/internal/article/repository.go @@ -0,0 +1,60 @@ +package article + +import ( + "database/sql" + "fmt" +) + +type ArticleRepository struct{} + +func NewArticleRepository() IArticleRepository { + return &ArticleRepository{} +} + +func (ArticleRepository) CreateArticle(tx *sql.Tx, data ArticleCreateModel) error { + query := ` + INSERT INTO articles(uuid, title, content) + VALUES ($1, $2, $3); + ` + + _, err := tx.Exec(query, data.Uuid, data.Title, data.Content) + return err +} + +func (ArticleRepository) GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) { + fmt.Println(title, " ------------------") + query := ` + SELECT uuid, title, content + FROM articles + WHERE title LIKE $1 || '%' + LIMIT 10; + ` + + rows, err := tx.Query(query, title) + if err != nil { + return nil, err + } + + articles := []ArticleModel{} + for rows.Next() { + var a ArticleModel + + err = rows.Scan(&a.Uuid, &a.Title, &a.Content) + if err != nil { + return nil, err + } + + articles = append(articles, a) + } + + fmt.Println(articles) + + return articles, nil +} + +func (ArticleRepository) DeleteAllArticles(tx *sql.Tx) error { + query := `DELETE FROM articles;` + + _, err := tx.Exec(query) + return err +} diff --git a/internal/article/service.go b/internal/article/service.go new file mode 100644 index 0000000..882c60d --- /dev/null +++ b/internal/article/service.go @@ -0,0 +1,107 @@ +package article + +import ( + "log" + "scrap/internal/db" + "scrap/internal/wikipediadl" + + "github.com/google/uuid" +) + +type ArticleService struct { + txRepo db.ITxRepository + articleRepo IArticleRepository +} + +func NewArticleService( + txRepo db.ITxRepository, + articleRepo IArticleRepository, +) IArticleService { + return &ArticleService{ + txRepo: txRepo, + articleRepo: articleRepo, + } +} + +func (a ArticleService) QueryArticles(data ArticleQueryDTO) ([]ArticleDTO, error) { + tx, err := a.txRepo.Begin() + if err != nil { + log.Println(err.Error()) + return nil, err + } + + defer a.txRepo.RollbackOnError(tx, &err) + + articleTitleLength := len(data.Title) + if articleTitleLength < 1 || articleTitleLength > 255 { + return nil, ErrArticleTitleInvalidLength + } + + articles, err := a.articleRepo.GetArticlesByTitle(tx, data.Title) + if err != nil { + log.Println(err.Error()) + return nil, ErrArticleQueryFailed + } + + articlesOut := make([]ArticleDTO, 0, len(articles)) + for _, am := range articles { + a := ArticleDTO{ + Uuid: am.Uuid, + Title: am.Title, + Content: am.Content, + } + + articlesOut = append(articlesOut, a) + } + + return articlesOut, nil +} + +func (a ArticleService) DownloadArticles() error { + tx, err := a.txRepo.Begin() + if err != nil { + log.Println(err.Error()) + return db.ErrTxBeginFailed + } + + defer a.txRepo.RollbackOnError(tx, &err) + + if err = a.articleRepo.DeleteAllArticles(tx); err != nil { + log.Println(err.Error()) + return ErrArticleDeleteAllFailed + } + + articleBundles, err := wikipediadl.FetchArticleBundles() + if err != nil { + log.Println(err.Error()) + return ErrArticleDownloadFailed + } + + for _, ab := range articleBundles { + articles, err := wikipediadl.ExtractArticles(ab) + if err != nil { + log.Println(err.Error()) + return ErrArticleDownloadFailed + } + + for _, article := range articles { + articleData := ArticleCreateModel{ + Uuid: uuid.NewString(), + Title: article.Title, + Content: article.Revision.Text, + } + + if err = a.articleRepo.CreateArticle(tx, articleData); err != nil { + log.Println(err.Error(), "tutaj ---------") + continue + } + } + } + + if err = a.txRepo.Commit(tx); err != nil { + log.Println(err.Error()) + return db.ErrTxCommitFailed + } + + return nil +} diff --git a/internal/db/error.go b/internal/db/error.go new file mode 100644 index 0000000..7e72d10 --- /dev/null +++ b/internal/db/error.go @@ -0,0 +1,8 @@ +package db + +import "errors" + +var ( + ErrTxBeginFailed = errors.New("tx: could not begin a Tx") + ErrTxCommitFailed = errors.New("tx: could not commit the Tx") +) diff --git a/internal/wikipediadl/const.go b/internal/wikipediadl/const.go new file mode 100644 index 0000000..a5c66ee --- /dev/null +++ b/internal/wikipediadl/const.go @@ -0,0 +1,6 @@ +package wikipediadl + +const ( + WikipediaDumpDomain = "dumps.wikimedia.org" + WikipediaDumpUrl = "https://" + WikipediaDumpDomain + "/plwiki/latest/" +) diff --git a/internal/wikipediadl/downloadarticles.go b/internal/wikipediadl/downloadarticles.go deleted file mode 100644 index 2d308ab..0000000 --- a/internal/wikipediadl/downloadarticles.go +++ /dev/null @@ -1,79 +0,0 @@ -package wikipediadl - -import ( - "compress/bzip2" - "encoding/xml" - "flag" - "fmt" - "io" - "log" - "net/http" -) - -type Page struct { - Title string `xml:"title"` - Revision Revision `xml:"revision"` -} - -type Revision struct { - Text string `xml:"text"` -} - -func DownloadArticles() { - url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2" - - limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all") - flag.Parse() - - log.Printf("Downloading chunk: %s", url) - resp, err := http.Get(url) - if err != nil { - log.Fatalf("Failed to download chunk: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != 200 { - body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) - log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body)) - } - - bz2Reader := bzip2.NewReader(resp.Body) - dec := xml.NewDecoder(bz2Reader) - - count := 0 - for { - tok, err := dec.Token() - if err != nil { - if err == io.EOF { - log.Println("Reached end of chunk") - break - } - log.Fatalf("XML token error: %v", err) - } - - switch se := tok.(type) { - case xml.StartElement: - if se.Name.Local == "page" { - var p Page - if err := dec.DecodeElement(&p, &se); err != nil { - log.Printf("Error decoding page: %v", err) - continue - } - - count++ - fmt.Printf("---- Article %d ----\n", count) - fmt.Printf("Title: %s\n", p.Title) - // fmt.Println("Content:") - // fmt.Println(p.Revision.Text) - fmt.Println("--------------------\n") - - if *limiter > 0 && count >= *limiter { - log.Printf("Reached limit of %d articles, stopping.", *limiter) - return - } - } - } - } - - log.Printf("Done. Total articles processed: %d", count) -} diff --git a/internal/wikipediadl/error.go b/internal/wikipediadl/error.go index 755fe66..06b3bcc 100644 --- a/internal/wikipediadl/error.go +++ b/internal/wikipediadl/error.go @@ -3,5 +3,6 @@ package wikipediadl import "errors" var ( - ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles") + ErrArticleBundleFetchFailed = errors.New("wikipediadl: failed to fetch article bundles") + ErrArticleDownloadFailed = errors.New("wikipediadl: failed to extract articles") ) diff --git a/internal/wikipediadl/extractarticles.go b/internal/wikipediadl/extractarticles.go new file mode 100644 index 0000000..ee7af3a --- /dev/null +++ b/internal/wikipediadl/extractarticles.go @@ -0,0 +1,75 @@ +package wikipediadl + +import ( + "compress/bzip2" + "encoding/xml" + "errors" + "io" + "log" + "net/http" +) + +type WikiArticle struct { + Title string `xml:"title"` + Revision Revision `xml:"revision"` +} + +type Revision struct { + Text string `xml:"text"` +} + +func ExtractArticles(bundle string) ([]WikiArticle, error) { + url := WikipediaDumpUrl + bundle + + resp, err := http.Get(url) + if err != nil { + log.Println(err.Error()) + return nil, errors.New("wikipediadl: failed load articles") + } + + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return nil, errors.New("wikipediadl: bad response status") + } + + bz2Reader := bzip2.NewReader(resp.Body) + xmlDec := xml.NewDecoder(bz2Reader) + + count := 0 + + articles := []WikiArticle{} +Loop: + for { + tok, err := xmlDec.Token() + if err != nil { + if err == io.EOF { + break + } + + return nil, errors.New("XML token error") + } + + switch se := tok.(type) { + case xml.StartElement: + if count == 2 { // XXX: remove later + break Loop + } + + if se.Name.Local != "page" { + continue + } + + var p WikiArticle + if err := xmlDec.DecodeElement(&p, &se); err != nil { + log.Println(err.Error()) + continue + } + + articles = append(articles, p) + count++ // XXX: remove later + } + } + + return articles, nil +} diff --git a/internal/wikipediadl/fetcharticles.go b/internal/wikipediadl/fetcharticles.go index c1cac77..c446c05 100644 --- a/internal/wikipediadl/fetcharticles.go +++ b/internal/wikipediadl/fetcharticles.go @@ -1,7 +1,6 @@ package wikipediadl import ( - "fmt" "log" "strings" @@ -13,12 +12,17 @@ const ( DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/" ) +func FetchArticleBundles() ([]string, error) { + scraper := getScraper() + + articles := getAllArticles(scraper) + return articles, nil +} + func getScraper() *colly.Collector { - s := colly.NewCollector( + return colly.NewCollector( colly.AllowedDomains(DumpDomain), ) - - return s } func getAllArticles(s *colly.Collector) []string { @@ -58,14 +62,3 @@ func isValidArticle(a string) bool { articleIndex := article[0] return articleIndex >= 48 && articleIndex <= 57 } - -func FetchArticles() error { - scraper := getScraper() - - articles := getAllArticles(scraper) - for _, a := range articles { - fmt.Println(a) - } - - return nil -} diff --git a/sqltable/1_articles.sql b/sqltable/1_articles.sql index 89f9137..2ffe654 100644 --- a/sqltable/1_articles.sql +++ b/sqltable/1_articles.sql @@ -1,5 +1,5 @@ CREATE TABLE IF NOT EXISTS articles( - uuid CHAR(36), + uuid CHAR(36) PRIMARY KEY, title VARCHAR(255), content TEXT ) \ No newline at end of file -- 2.49.1 From f5040fd91ea8cfa2b653461d157d34fb1233a550 Mon Sep 17 00:00:00 2001 From: Ziemniak Date: Sun, 5 Oct 2025 04:19:21 +0200 Subject: [PATCH 4/5] OSM --- api/setup.go | 1 + cmd/serve/main.go | 3 + internal/osm/osm.go | 152 ++++++++++++++++++++++++++++++++++++++++ sqltable/1_articles.sql | 4 +- 4 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 internal/osm/osm.go diff --git a/api/setup.go b/api/setup.go index 7d958b4..06cab1b 100644 --- a/api/setup.go +++ b/api/setup.go @@ -12,6 +12,7 @@ func Setup() { r.Get("/articles", article.ArticleQueryHandler) r.Get("/articles-download", article.ArticleDownloadHandler) + r.Handle("/tiles/", http.StripPrefix("/tiles/", http.FileServer(http.Dir("tiles")))) http.ListenAndServe(":8080", r) } diff --git a/cmd/serve/main.go b/cmd/serve/main.go index 45f945f..6769b0e 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -5,6 +5,7 @@ import ( "scrap/api" "scrap/internal/config" "scrap/internal/db" + "scrap/internal/osm" ) func main() { @@ -15,5 +16,7 @@ func main() { log.SetFlags(log.Lshortfile) + osm.OSM() + api.Setup() } diff --git a/internal/osm/osm.go b/internal/osm/osm.go new file mode 100644 index 0000000..865a618 --- /dev/null +++ b/internal/osm/osm.go @@ -0,0 +1,152 @@ +package osm + +import ( + "context" + "fmt" + "io" + "log" + "math" + "net/http" + "os" + "path/filepath" + "runtime" + "strconv" + "sync" + "time" +) + +var ( + userLat = 50.06465 + userLon = 19.94598 + radiusM = 2000.0 + maxZoom = 17 + userAgent = "krakow-tiles-downloader/1.0 (+your_email@example.com)" + osmTileURL = "https://tile.openstreetmap.org/%d/%d/%d.png" + tilesDir = "tiles" +) + +const earthRadius = 6378137.0 + +func offsetLatLon(lat, lon, distance, bearingRad float64) (float64, float64) { + r := earthRadius + latRad := lat * math.Pi / 180.0 + lonRad := lon * math.Pi / 180.0 + angDist := distance / r + newLatRad := math.Asin(math.Sin(latRad)*math.Cos(angDist) + math.Cos(latRad)*math.Sin(angDist)*math.Cos(bearingRad)) + newLonRad := lonRad + math.Atan2(math.Sin(bearingRad)*math.Sin(angDist)*math.Cos(latRad), + math.Cos(angDist)-math.Sin(latRad)*math.Sin(newLatRad)) + return newLatRad * 180.0 / math.Pi, newLonRad * 180.0 / math.Pi +} + +func boundingBoxForCircle(lat, lon, radius float64) (minLat, maxLat, minLon, maxLon float64) { + maxLat, _ = offsetLatLon(lat, lon, radius, 0) + minLat, _ = offsetLatLon(lat, lon, radius, math.Pi) + _, maxLon = offsetLatLon(lat, lon, radius, math.Pi/2) + _, minLon = offsetLatLon(lat, lon, radius, 3*math.Pi/2) + return +} + +func latLonToTile(lat, lon float64, z int) (x, y int) { + latRad := lat * math.Pi / 180.0 + n := math.Pow(2.0, float64(z)) + xFloat := (lon + 180.0) / 360.0 * n + yFloat := (1.0 - math.Log(math.Tan(latRad)+1.0/math.Cos(latRad))/math.Pi) / 2.0 * n + return int(math.Floor(xFloat)), int(math.Floor(yFloat)) +} + +func tileXYBounds(minLat, maxLat, minLon, maxLon float64, z int) (minX, maxX, minY, maxY int) { + x1, y1 := latLonToTile(maxLat, minLon, z) + x2, y2 := latLonToTile(minLat, maxLon, z) + if x1 > x2 { + minX, maxX = x2, x1 + } else { + minX, maxX = x1, x2 + } + if y1 > y2 { + minY, maxY = y2, y1 + } else { + minY, maxY = y1, y2 + } + return +} + +func downloadTile(ctx context.Context, z, x, y int) error { + url := fmt.Sprintf(osmTileURL, z, x, y) + req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) + req.Header.Set("User-Agent", userAgent) + resp, err := http.DefaultClient.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + if resp.StatusCode == 429 || resp.StatusCode == 403 { + time.Sleep(5 * time.Second) + return nil + } + if resp.StatusCode != 200 { + return fmt.Errorf("HTTP %d for %s", resp.StatusCode, url) + } + + tileCount := 1 << uint(z) + yFlipped := tileCount - 1 - y + path := filepath.Join(tilesDir, strconv.Itoa(z), strconv.Itoa(x)) + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + filePath := filepath.Join(path, fmt.Sprintf("%d.png", yFlipped)) + f, err := os.Create(filePath) + if err != nil { + return err + } + defer f.Close() + _, err = io.Copy(f, resp.Body) + return err +} + +func downloadRange(ctx context.Context, z, minX, maxX, minY, maxY int) { + log.Printf("Downloading tiles zoom %d: x %d..%d, y %d..%d", z, minX, maxX, minY, maxY) + sem := make(chan struct{}, runtime.NumCPU()) + var wg sync.WaitGroup + rate := time.NewTicker(1 * time.Second) + defer rate.Stop() + + for x := minX; x <= maxX; x++ { + for y := minY; y <= maxY; y++ { + wg.Add(1) + sem <- struct{}{} + <-rate.C + go func(x, y int) { + defer wg.Done() + defer func() { <-sem }() + filePath := filepath.Join(tilesDir, strconv.Itoa(z), strconv.Itoa(x), fmt.Sprintf("%d.png", (1< Date: Sun, 5 Oct 2025 04:23:49 +0200 Subject: [PATCH 5/5] Gitignore update --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3997bea..d079ff2 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -*.db \ No newline at end of file +*.db +tiles/ \ No newline at end of file -- 2.49.1