From 6df63dc4c1b08f9973ed503a971b0cd227c32377 Mon Sep 17 00:00:00 2001 From: Oliwier Adamczyk Date: Sat, 4 Oct 2025 23:14:41 +0200 Subject: [PATCH] Added: downloading and returning wikipedia articles --- api/article/handler.go | 78 +++++++++++++++++ api/article/httperror.go | 14 +++ api/article/request.go | 16 ++++ api/article/response.go | 15 ++++ api/httpio/httperror.go | 24 +++++ api/httpio/request.go | 38 ++++++++ api/httpio/response.go | 21 +++++ api/httpio/urlquery.go | 86 ++++++++++++++++++ api/setup.go | 17 ++++ cmd/serve/main.go | 7 +- go.mod | 2 + go.sum | 4 + internal/article/dto.go | 11 +++ internal/article/error.go | 12 +++ internal/article/irepository.go | 9 ++ internal/article/iservice.go | 6 ++ internal/article/model.go | 13 +++ internal/article/repository.go | 60 +++++++++++++ internal/article/service.go | 107 +++++++++++++++++++++++ internal/db/error.go | 8 ++ internal/wikipediadl/const.go | 6 ++ internal/wikipediadl/downloadarticles.go | 79 ----------------- internal/wikipediadl/error.go | 3 +- internal/wikipediadl/extractarticles.go | 75 ++++++++++++++++ internal/wikipediadl/fetcharticles.go | 23 ++--- sqltable/1_articles.sql | 2 +- 26 files changed, 636 insertions(+), 100 deletions(-) create mode 100644 api/article/handler.go create mode 100644 api/article/httperror.go create mode 100644 api/article/request.go create mode 100644 api/article/response.go create mode 100644 api/httpio/httperror.go create mode 100644 api/httpio/request.go create mode 100644 api/httpio/response.go create mode 100644 api/httpio/urlquery.go create mode 100644 api/setup.go create mode 100644 internal/article/dto.go create mode 100644 internal/article/error.go create mode 100644 internal/article/irepository.go create mode 100644 internal/article/iservice.go create mode 100644 internal/article/model.go create mode 100644 internal/article/repository.go create mode 100644 internal/article/service.go create mode 100644 internal/db/error.go create mode 100644 
internal/wikipediadl/const.go delete mode 100644 internal/wikipediadl/downloadarticles.go create mode 100644 internal/wikipediadl/extractarticles.go diff --git a/api/article/handler.go b/api/article/handler.go new file mode 100644 index 0000000..b7416ec --- /dev/null +++ b/api/article/handler.go @@ -0,0 +1,78 @@ +package article + +import ( + "net/http" + "scrap/api/httpio" + "scrap/internal/article" + "scrap/internal/db" +) + +func ArticleDownloadHandler(w http.ResponseWriter, r *http.Request) { + dbInstance := db.GetInstance() + txRepo := db.NewTxRepository(dbInstance) + articleRepo := article.NewArticleRepository() + + service := article.NewArticleService(txRepo, articleRepo) + if err := service.DownloadArticles(); err != nil { + switch err { + default: + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + } + + return + } +} + +func ArticleQueryHandler(w http.ResponseWriter, r *http.Request) { + body, err := httpio.ParseURLQuery[ArticleQueryRequest]( + r, + httpio.URLQueryKey[string]("title"), + ) + if err != nil { + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + return + } + + if httpErr := body.Validate(); httpErr != nil { + httpErr.Raise(w) + return + } + + dbInstance := db.GetInstance() + txRepo := db.NewTxRepository(dbInstance) + articleRepo := article.NewArticleRepository() + + service := article.NewArticleService(txRepo, articleRepo) + + articleQueryData := article.ArticleQueryDTO{ + Title: body.Title, + } + + articles, err := service.QueryArticles(articleQueryData) + if err != nil { + switch err { + case article.ErrArticleTitleInvalidLength: + ErrHttpArticleTitleInvalidLength.Raise(w) + default: + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + } + + return + } + + articlesOut := make([]ArticleResponse, 0, len(articles)) + for _, a := range articles { + ar := ArticleResponse{ + Uuid: a.Uuid, + Title: a.Title, + Content: a.Content, + } + + articlesOut = append(articlesOut, ar) + } + + if err = 
ArticleQueryResponse(articlesOut).Return(w, http.StatusOK); err != nil { + httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError) + return + } +} diff --git a/api/article/httperror.go b/api/article/httperror.go new file mode 100644 index 0000000..bf88bb6 --- /dev/null +++ b/api/article/httperror.go @@ -0,0 +1,14 @@ +package article + +import ( + "net/http" + "scrap/api/httpio" +) + +var ( + ErrHttpArticleTitleInvalidLength = httpio.HTTPError{ + StatusCode: http.StatusBadRequest, + ErrorCode: "ARTICLE_TITLE_LENGTH", + Message: "Invalid title length.", + } +) diff --git a/api/article/request.go b/api/article/request.go new file mode 100644 index 0000000..7ee6a20 --- /dev/null +++ b/api/article/request.go @@ -0,0 +1,16 @@ +package article + +import "scrap/api/httpio" + +type ArticleQueryRequest struct { + Title string `json:"title"` +} + +func (a ArticleQueryRequest) Validate() *httpio.HTTPError { + titleLength := len(a.Title) + if titleLength < 1 || titleLength > 255 { + return &ErrHttpArticleTitleInvalidLength + } + + return nil +} diff --git a/api/article/response.go b/api/article/response.go new file mode 100644 index 0000000..b8b2401 --- /dev/null +++ b/api/article/response.go @@ -0,0 +1,15 @@ +package article + +import "scrap/api/httpio" + +type ArticleResponse struct { + Uuid string `json:"uuid"` + Title string `json:"title"` + Content string `json:"content"` +} + +func ArticleQueryResponse(articles []ArticleResponse) httpio.ResponseIO { + return httpio.ResponseIO{ + "articles": articles, + } +} diff --git a/api/httpio/httperror.go b/api/httpio/httperror.go new file mode 100644 index 0000000..59de5f8 --- /dev/null +++ b/api/httpio/httperror.go @@ -0,0 +1,24 @@ +package httpio + +import ( + "encoding/json" + "net/http" +) + +type HTTPError struct { + StatusCode int `json:"-"` + ErrorCode string `json:"error-code"` + Message string `json:"message"` +} + +func (h HTTPError) Raise(w http.ResponseWriter) { + jsonBytes, _ := json.Marshal(h) + + 
w.Header().Set("Content-Type", "application/json") + w.WriteHeader(h.StatusCode) + w.Write(jsonBytes) +} + +func RaiseOnlyStatusCode(w http.ResponseWriter, code int) { + http.Error(w, "", code) +} diff --git a/api/httpio/request.go b/api/httpio/request.go new file mode 100644 index 0000000..fb024bd --- /dev/null +++ b/api/httpio/request.go @@ -0,0 +1,38 @@ +package httpio + +import ( + "encoding/json" + "errors" + "io" + "log" + "net/http" +) + +type IRequestIO interface { + // Validates the received request. + Validate() *HTTPError +} + +// Parses request body into the provided struct. +// Throws an error if the body could not be parsed. +func ParseRequestBody[T IRequestIO](r *http.Request) (*T, error) { + requestBytes, err := io.ReadAll(r.Body) + if err != nil { + log.Println(err.Error()) + return nil, err + } + + if !json.Valid(requestBytes) { + return nil, errors.New("invalid JSON format") + } + + var req T + err = json.Unmarshal(requestBytes, &req) + if err != nil { + log.Println(err.Error()) + return nil, err + } + + return &req, nil + +} diff --git a/api/httpio/response.go b/api/httpio/response.go new file mode 100644 index 0000000..897f7df --- /dev/null +++ b/api/httpio/response.go @@ -0,0 +1,21 @@ +package httpio + +import ( + "encoding/json" + "net/http" +) + +type ResponseIO map[string]any + +func (r ResponseIO) Return(w http.ResponseWriter, statusCode int) error { + jsonBytes, err := json.Marshal(r) + if err != nil { + return err + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(statusCode) + w.Write(jsonBytes) + + return nil +} diff --git a/api/httpio/urlquery.go b/api/httpio/urlquery.go new file mode 100644 index 0000000..70fd69d --- /dev/null +++ b/api/httpio/urlquery.go @@ -0,0 +1,86 @@ +package httpio + +import ( + "encoding/json" + "errors" + "net/http" + "strconv" +) + +type URLQueryValueType interface { + string | int | float32 | float64 | bool +} + +type iURLQueryKeyType interface { + GetKey() string +} + +type 
URLQueryKeyType[T URLQueryValueType] struct { + Key string + _ T +} + +func (u URLQueryKeyType[T]) GetKey() string { return u.Key } + +func URLQueryKey[T URLQueryValueType](key string) iURLQueryKeyType { + return URLQueryKeyType[T]{ + Key: key, + } +} + +func ParseURLQuery[T IRequestIO](r *http.Request, keys ...iURLQueryKeyType) (*T, error) { + query := make(map[string]any, len(keys)) + + for _, key := range keys { + queryValue := r.URL.Query().Get(key.GetKey()) + + if queryValue == "" { + continue + } + + switch key.(type) { + case URLQueryKeyType[string]: + query[key.GetKey()] = queryValue + case URLQueryKeyType[int]: + x, err := strconv.Atoi(queryValue) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[float32]: + x, err := strconv.ParseFloat(queryValue, 32) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[float64]: + x, err := strconv.ParseFloat(queryValue, 64) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + case URLQueryKeyType[bool]: + x, err := strconv.ParseBool(queryValue) + if err != nil { + return nil, err + } + + query[key.GetKey()] = x + default: + return nil, errors.New("unsupported URL query key type") + } + } + + queryBytes, _ := json.Marshal(query) + + var req T + err := json.Unmarshal(queryBytes, &req) + if err != nil { + return nil, err + } + + return &req, nil +} diff --git a/api/setup.go b/api/setup.go new file mode 100644 index 0000000..7d958b4 --- /dev/null +++ b/api/setup.go @@ -0,0 +1,17 @@ +package api + +import ( + "net/http" + "scrap/api/article" + + "github.com/go-chi/chi" +) + +func Setup() { + r := chi.NewRouter() + + r.Get("/articles", article.ArticleQueryHandler) + r.Get("/articles-download", article.ArticleDownloadHandler) + + http.ListenAndServe(":8080", r) +} diff --git a/cmd/serve/main.go b/cmd/serve/main.go index f07a465..45f945f 100644 --- a/cmd/serve/main.go +++ b/cmd/serve/main.go @@ -2,9 +2,9 @@ package main import ( 
"log" + "scrap/api" "scrap/internal/config" "scrap/internal/db" - "scrap/internal/wikipediadl" ) func main() { @@ -13,8 +13,7 @@ func main() { db.Setup() defer db.Close() - if err := wikipediadl.FetchArticles(); err != nil { - log.Println(err.Error()) - } + log.SetFlags(log.Lshortfile) + api.Setup() } diff --git a/go.mod b/go.mod index 782b439..766969b 100644 --- a/go.mod +++ b/go.mod @@ -3,6 +3,7 @@ module scrap go 1.24.4 require ( + github.com/go-chi/chi v1.5.5 github.com/gocolly/colly v1.2.0 github.com/mattn/go-sqlite3 v1.14.32 ) @@ -16,6 +17,7 @@ require ( github.com/gobwas/glob v0.2.3 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/protobuf v1.5.4 // indirect + github.com/google/uuid v1.6.0 github.com/kennygrant/sanitize v1.2.4 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect github.com/temoto/robotstxt v1.1.2 // indirect diff --git a/go.sum b/go.sum index 0fe57bd..b6ab6b0 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ= github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE= +github.com/go-chi/chi v1.5.5/go.mod h1:C9JqLr3tIYjDOZpzn+BCuxY8z8vmca43EeMgyZt7irw= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI= @@ -26,6 +28,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp 
v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs= diff --git a/internal/article/dto.go b/internal/article/dto.go new file mode 100644 index 0000000..23dc451 --- /dev/null +++ b/internal/article/dto.go @@ -0,0 +1,11 @@ +package article + +type ArticleDTO struct { + Uuid string + Title string + Content string +} + +type ArticleQueryDTO struct { + Title string +} diff --git a/internal/article/error.go b/internal/article/error.go new file mode 100644 index 0000000..456df0d --- /dev/null +++ b/internal/article/error.go @@ -0,0 +1,12 @@ +package article + +import "errors" + +var ( + ErrArticleDownloadFailed = errors.New("article: download failed") + ErrArticleQueryFailed = errors.New("article: article query failed") + ErrArticleCreateFailed = errors.New("article: create failed") + ErrArticleDeleteAllFailed = errors.New("article: failed to delete all articles") + + ErrArticleTitleInvalidLength = errors.New("article: invalid title length") +) diff --git a/internal/article/irepository.go b/internal/article/irepository.go new file mode 100644 index 0000000..9736f25 --- /dev/null +++ b/internal/article/irepository.go @@ -0,0 +1,9 @@ +package article + +import "database/sql" + +type IArticleRepository interface { + CreateArticle(tx *sql.Tx, data ArticleCreateModel) error + GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) + DeleteAllArticles(tx *sql.Tx) error +} diff --git a/internal/article/iservice.go b/internal/article/iservice.go new file 
mode 100644 index 0000000..543eb2a --- /dev/null +++ b/internal/article/iservice.go @@ -0,0 +1,6 @@ +package article + +type IArticleService interface { + DownloadArticles() error + QueryArticles(ArticleQueryDTO) ([]ArticleDTO, error) +} diff --git a/internal/article/model.go b/internal/article/model.go new file mode 100644 index 0000000..84b7e6e --- /dev/null +++ b/internal/article/model.go @@ -0,0 +1,13 @@ +package article + +type ArticleModel struct { + Uuid string + Title string + Content string +} + +type ArticleCreateModel struct { + Uuid string + Title string + Content string +} diff --git a/internal/article/repository.go b/internal/article/repository.go new file mode 100644 index 0000000..7ae685d --- /dev/null +++ b/internal/article/repository.go @@ -0,0 +1,60 @@ +package article + +import ( + "database/sql" + "fmt" +) + +type ArticleRepository struct{} + +func NewArticleRepository() IArticleRepository { + return &ArticleRepository{} +} + +func (ArticleRepository) CreateArticle(tx *sql.Tx, data ArticleCreateModel) error { + query := ` + INSERT INTO articles(uuid, title, content) + VALUES ($1, $2, $3); + ` + + _, err := tx.Exec(query, data.Uuid, data.Title, data.Content) + return err +} + +func (ArticleRepository) GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) { + fmt.Println(title, " ------------------") + query := ` + SELECT uuid, title, content + FROM articles + WHERE title LIKE $1 || '%' + LIMIT 10; + ` + + rows, err := tx.Query(query, title) + if err != nil { + return nil, err + } + + articles := []ArticleModel{} + for rows.Next() { + var a ArticleModel + + err = rows.Scan(&a.Uuid, &a.Title, &a.Content) + if err != nil { + return nil, err + } + + articles = append(articles, a) + } + + fmt.Println(articles) + + return articles, nil +} + +func (ArticleRepository) DeleteAllArticles(tx *sql.Tx) error { + query := `DELETE FROM articles;` + + _, err := tx.Exec(query) + return err +} diff --git a/internal/article/service.go 
b/internal/article/service.go new file mode 100644 index 0000000..882c60d --- /dev/null +++ b/internal/article/service.go @@ -0,0 +1,107 @@ +package article + +import ( + "log" + "scrap/internal/db" + "scrap/internal/wikipediadl" + + "github.com/google/uuid" +) + +type ArticleService struct { + txRepo db.ITxRepository + articleRepo IArticleRepository +} + +func NewArticleService( + txRepo db.ITxRepository, + articleRepo IArticleRepository, +) IArticleService { + return &ArticleService{ + txRepo: txRepo, + articleRepo: articleRepo, + } +} + +func (a ArticleService) QueryArticles(data ArticleQueryDTO) ([]ArticleDTO, error) { + tx, err := a.txRepo.Begin() + if err != nil { + log.Println(err.Error()) + return nil, err + } + + defer a.txRepo.RollbackOnError(tx, &err) + + articleTitleLength := len(data.Title) + if articleTitleLength < 1 || articleTitleLength > 255 { + return nil, ErrArticleTitleInvalidLength + } + + articles, err := a.articleRepo.GetArticlesByTitle(tx, data.Title) + if err != nil { + log.Println(err.Error()) + return nil, ErrArticleQueryFailed + } + + articlesOut := make([]ArticleDTO, 0, len(articles)) + for _, am := range articles { + a := ArticleDTO{ + Uuid: am.Uuid, + Title: am.Title, + Content: am.Content, + } + + articlesOut = append(articlesOut, a) + } + + return articlesOut, nil +} + +func (a ArticleService) DownloadArticles() error { + tx, err := a.txRepo.Begin() + if err != nil { + log.Println(err.Error()) + return db.ErrTxBeginFailed + } + + defer a.txRepo.RollbackOnError(tx, &err) + + if err = a.articleRepo.DeleteAllArticles(tx); err != nil { + log.Println(err.Error()) + return ErrArticleDeleteAllFailed + } + + articleBundles, err := wikipediadl.FetchArticleBundles() + if err != nil { + log.Println(err.Error()) + return ErrArticleDownloadFailed + } + + for _, ab := range articleBundles { + articles, err := wikipediadl.ExtractArticles(ab) + if err != nil { + log.Println(err.Error()) + return ErrArticleDownloadFailed + } + + for _, article := 
range articles { + articleData := ArticleCreateModel{ + Uuid: uuid.NewString(), + Title: article.Title, + Content: article.Revision.Text, + } + + if err = a.articleRepo.CreateArticle(tx, articleData); err != nil { + log.Println(err.Error(), "tutaj ---------") + continue + } + } + } + + if err = a.txRepo.Commit(tx); err != nil { + log.Println(err.Error()) + return db.ErrTxCommitFailed + } + + return nil +} diff --git a/internal/db/error.go b/internal/db/error.go new file mode 100644 index 0000000..7e72d10 --- /dev/null +++ b/internal/db/error.go @@ -0,0 +1,8 @@ +package db + +import "errors" + +var ( + ErrTxBeginFailed = errors.New("tx: could not begin a Tx") + ErrTxCommitFailed = errors.New("tx: could not commit the Tx") +) diff --git a/internal/wikipediadl/const.go b/internal/wikipediadl/const.go new file mode 100644 index 0000000..a5c66ee --- /dev/null +++ b/internal/wikipediadl/const.go @@ -0,0 +1,6 @@ +package wikipediadl + +const ( + WikipediaDumpDomain = "dumps.wikimedia.org" + WikipediaDumpUrl = "https://" + WikipediaDumpDomain + "/plwiki/latest/" +) diff --git a/internal/wikipediadl/downloadarticles.go b/internal/wikipediadl/downloadarticles.go deleted file mode 100644 index 2d308ab..0000000 --- a/internal/wikipediadl/downloadarticles.go +++ /dev/null @@ -1,79 +0,0 @@ -package wikipediadl - -import ( - "compress/bzip2" - "encoding/xml" - "flag" - "fmt" - "io" - "log" - "net/http" -) - -type Page struct { - Title string `xml:"title"` - Revision Revision `xml:"revision"` -} - -type Revision struct { - Text string `xml:"text"` -} - -func DownloadArticles() { - url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2" - - limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all") - flag.Parse() - - log.Printf("Downloading chunk: %s", url) - resp, err := http.Get(url) - if err != nil { - log.Fatalf("Failed to download chunk: %v", err) - } - defer resp.Body.Close() - - if resp.StatusCode != 200 { - 
body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) - log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body)) - } - - bz2Reader := bzip2.NewReader(resp.Body) - dec := xml.NewDecoder(bz2Reader) - - count := 0 - for { - tok, err := dec.Token() - if err != nil { - if err == io.EOF { - log.Println("Reached end of chunk") - break - } - log.Fatalf("XML token error: %v", err) - } - - switch se := tok.(type) { - case xml.StartElement: - if se.Name.Local == "page" { - var p Page - if err := dec.DecodeElement(&p, &se); err != nil { - log.Printf("Error decoding page: %v", err) - continue - } - - count++ - fmt.Printf("---- Article %d ----\n", count) - fmt.Printf("Title: %s\n", p.Title) - // fmt.Println("Content:") - // fmt.Println(p.Revision.Text) - fmt.Println("--------------------\n") - - if *limiter > 0 && count >= *limiter { - log.Printf("Reached limit of %d articles, stopping.", *limiter) - return - } - } - } - } - - log.Printf("Done. Total articles processed: %d", count) -} diff --git a/internal/wikipediadl/error.go b/internal/wikipediadl/error.go index 755fe66..06b3bcc 100644 --- a/internal/wikipediadl/error.go +++ b/internal/wikipediadl/error.go @@ -3,5 +3,6 @@ package wikipediadl import "errors" var ( - ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles") + ErrArticleBundleFetchFailed = errors.New("wikipediadl: failed to fetch article bundles") + ErrArticleDownloadFailed = errors.New("wikipediadl: failed to extract articles") ) diff --git a/internal/wikipediadl/extractarticles.go b/internal/wikipediadl/extractarticles.go new file mode 100644 index 0000000..ee7af3a --- /dev/null +++ b/internal/wikipediadl/extractarticles.go @@ -0,0 +1,75 @@ +package wikipediadl + +import ( + "compress/bzip2" + "encoding/xml" + "errors" + "io" + "log" + "net/http" +) + +type WikiArticle struct { + Title string `xml:"title"` + Revision Revision `xml:"revision"` +} + +type Revision struct { + Text string `xml:"text"` +} + +func 
ExtractArticles(bundle string) ([]WikiArticle, error) { + url := WikipediaDumpUrl + bundle + + resp, err := http.Get(url) + if err != nil { + log.Println(err.Error()) + return nil, errors.New("wikipediadl: failed to load articles") + } + + defer resp.Body.Close() + + if resp.StatusCode != 200 { + return nil, errors.New("wikipediadl: bad response status") + } + + bz2Reader := bzip2.NewReader(resp.Body) + xmlDec := xml.NewDecoder(bz2Reader) + + count := 0 + + articles := []WikiArticle{} +Loop: + for { + tok, err := xmlDec.Token() + if err != nil { + if err == io.EOF { + break + } + + return nil, errors.New("XML token error") + } + + switch se := tok.(type) { + case xml.StartElement: + if count == 2 { // XXX: remove later + break Loop + } + + if se.Name.Local != "page" { + continue + } + + var p WikiArticle + if err := xmlDec.DecodeElement(&p, &se); err != nil { + log.Println(err.Error()) + continue + } + + articles = append(articles, p) + count++ // XXX: remove later + } + } + + return articles, nil +} diff --git a/internal/wikipediadl/fetcharticles.go b/internal/wikipediadl/fetcharticles.go index c1cac77..c446c05 100644 --- a/internal/wikipediadl/fetcharticles.go +++ b/internal/wikipediadl/fetcharticles.go @@ -1,7 +1,6 @@ package wikipediadl import ( - "fmt" "log" "strings" @@ -13,12 +12,17 @@ const ( DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/" ) +func FetchArticleBundles() ([]string, error) { + scraper := getScraper() + + articles := getAllArticles(scraper) + return articles, nil +} + func getScraper() *colly.Collector { - s := colly.NewCollector( + return colly.NewCollector( colly.AllowedDomains(DumpDomain), ) - - return s } func getAllArticles(s *colly.Collector) []string { @@ -58,14 +62,3 @@ func isValidArticle(a string) bool { articleIndex := article[0] return articleIndex >= 48 && articleIndex <= 57 } - -func FetchArticles() error { - scraper := getScraper() - - articles := getAllArticles(scraper) - for _, a := range articles { - fmt.Println(a) - } - - 
return nil -} diff --git a/sqltable/1_articles.sql b/sqltable/1_articles.sql index 89f9137..2ffe654 100644 --- a/sqltable/1_articles.sql +++ b/sqltable/1_articles.sql @@ -1,5 +1,5 @@ CREATE TABLE IF NOT EXISTS articles( - uuid CHAR(36), + uuid CHAR(36) PRIMARY KEY, title VARCHAR(255), content TEXT ) \ No newline at end of file