Added: downloading and returning Wikipedia articles
78  api/article/handler.go  Normal file
@@ -0,0 +1,78 @@
package article

import (
	"net/http"
	"scrap/api/httpio"
	"scrap/internal/article"
	"scrap/internal/db"
)

func ArticleDownloadHandler(w http.ResponseWriter, r *http.Request) {
	dbInstance := db.GetInstance()
	txRepo := db.NewTxRepository(dbInstance)
	articleRepo := article.NewArticleRepository()

	service := article.NewArticleService(txRepo, articleRepo)
	if err := service.DownloadArticles(); err != nil {
		switch err {
		default:
			httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		}

		return
	}
}

func ArticleQueryHandler(w http.ResponseWriter, r *http.Request) {
	body, err := httpio.ParseURLQuery[ArticleQueryRequest](
		r,
		httpio.URLQueryKey[string]("title"),
	)
	if err != nil {
		httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		return
	}

	if httpErr := body.Validate(); httpErr != nil {
		httpErr.Raise(w)
		return
	}

	dbInstance := db.GetInstance()
	txRepo := db.NewTxRepository(dbInstance)
	articleRepo := article.NewArticleRepository()

	service := article.NewArticleService(txRepo, articleRepo)

	articleQueryData := article.ArticleQueryDTO{
		Title: body.Title,
	}

	articles, err := service.QueryArticles(articleQueryData)
	if err != nil {
		switch err {
		case article.ErrArticleTitleInvalidLength:
			ErrHttpArticleTitleInvalidLength.Raise(w)
		default:
			httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		}

		return
	}

	articlesOut := make([]ArticleResponse, 0, len(articles))
	for _, a := range articles {
		ar := ArticleResponse{
			Uuid:    a.Uuid,
			Title:   a.Title,
			Content: a.Content,
		}

		articlesOut = append(articlesOut, ar)
	}

	if err = ArticleQueryResponse(articlesOut).Return(w, http.StatusOK); err != nil {
		httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		return
	}
}
14  api/article/httperror.go  Normal file
@@ -0,0 +1,14 @@
package article

import (
	"net/http"
	"scrap/api/httpio"
)

var (
	ErrHttpArticleTitleInvalidLength = httpio.HTTPError{
		StatusCode: http.StatusBadRequest,
		ErrorCode:  "ARTICLE_TITLE_LENGTH",
		Message:    "Invalid title length.",
	}
)
16  api/article/request.go  Normal file
@@ -0,0 +1,16 @@
package article

import "scrap/api/httpio"

type ArticleQueryRequest struct {
	Title string `json:"title"`
}

func (a ArticleQueryRequest) Validate() *httpio.HTTPError {
	titleLength := len(a.Title)
	if titleLength < 1 || titleLength > 255 {
		return &ErrHttpArticleTitleInvalidLength
	}

	return nil
}
15  api/article/response.go  Normal file
@@ -0,0 +1,15 @@
package article

import "scrap/api/httpio"

type ArticleResponse struct {
	Uuid    string `json:"uuid"`
	Title   string `json:"title"`
	Content string `json:"content"`
}

func ArticleQueryResponse(articles []ArticleResponse) httpio.ResponseIO {
	return httpio.ResponseIO{
		"articles": articles,
	}
}
24  api/httpio/httperror.go  Normal file
@@ -0,0 +1,24 @@
package httpio

import (
	"encoding/json"
	"net/http"
)

type HTTPError struct {
	StatusCode int    `json:"-"`
	ErrorCode  string `json:"error-code"`
	Message    string `json:"message"`
}

func (h HTTPError) Raise(w http.ResponseWriter) {
	// Marshalling a flat struct of strings cannot realistically fail,
	// so the error is deliberately ignored here.
	jsonBytes, _ := json.Marshal(h)

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(h.StatusCode)
	w.Write(jsonBytes)
}

func RaiseOnlyStatusCode(w http.ResponseWriter, code int) {
	http.Error(w, "", code)
}
38  api/httpio/request.go  Normal file
@@ -0,0 +1,38 @@
package httpio

import (
	"encoding/json"
	"errors"
	"io"
	"log"
	"net/http"
)

type IRequestIO interface {
	// Validates the received request.
	Validate() *HTTPError
}

// Parses the request body into the provided struct.
// Returns an error if the body could not be parsed.
func ParseRequestBody[T IRequestIO](r *http.Request) (*T, error) {
	requestBytes, err := io.ReadAll(r.Body)
	if err != nil {
		log.Println(err.Error())
		return nil, err
	}

	if !json.Valid(requestBytes) {
		return nil, errors.New("invalid JSON format")
	}

	var req T
	err = json.Unmarshal(requestBytes, &req)
	if err != nil {
		log.Println(err.Error())
		return nil, err
	}

	return &req, nil
}
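Nothing in this commit calls ParseRequestBody yet, so here is a minimal usage sketch (not part of the diff). CreateNoteRequest and createNoteHandler are hypothetical; any struct with a Validate() *HTTPError method satisfies IRequestIO:

	package httpio

	import "net/http"

	type CreateNoteRequest struct {
		Text string `json:"text"`
	}

	func (c CreateNoteRequest) Validate() *HTTPError { return nil }

	func createNoteHandler(w http.ResponseWriter, r *http.Request) {
		// The type parameter picks both the target struct and its validator.
		body, err := ParseRequestBody[CreateNoteRequest](r)
		if err != nil {
			RaiseOnlyStatusCode(w, http.StatusBadRequest)
			return
		}

		if httpErr := body.Validate(); httpErr != nil {
			httpErr.Raise(w)
			return
		}

		_ = body.Text // use the parsed, validated request
	}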
21  api/httpio/response.go  Normal file
@@ -0,0 +1,21 @@
package httpio

import (
	"encoding/json"
	"net/http"
)

type ResponseIO map[string]any

func (r ResponseIO) Return(w http.ResponseWriter, statusCode int) error {
	jsonBytes, err := json.Marshal(r)
	if err != nil {
		return err
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(statusCode)
	w.Write(jsonBytes)

	return nil
}
86  api/httpio/urlquery.go  Normal file
@@ -0,0 +1,86 @@
package httpio

import (
	"encoding/json"
	"errors"
	"net/http"
	"strconv"
)

type URLQueryValueType interface {
	string | int | float32 | float64 | bool
}

type iURLQueryKeyType interface {
	GetKey() string
}

type URLQueryKeyType[T URLQueryValueType] struct {
	Key string
	// Phantom field: never set, it only carries T so the type switch in
	// ParseURLQuery can tell which conversion the key expects.
	_ T
}

func (u URLQueryKeyType[T]) GetKey() string { return u.Key }

func URLQueryKey[T URLQueryValueType](key string) iURLQueryKeyType {
	return URLQueryKeyType[T]{
		Key: key,
	}
}

func ParseURLQuery[T IRequestIO](r *http.Request, keys ...iURLQueryKeyType) (*T, error) {
	query := make(map[string]any, len(keys))

	for _, key := range keys {
		queryValue := r.URL.Query().Get(key.GetKey())

		if queryValue == "" {
			continue
		}

		switch key.(type) {
		case URLQueryKeyType[string]:
			query[key.GetKey()] = queryValue
		case URLQueryKeyType[int]:
			x, err := strconv.Atoi(queryValue)
			if err != nil {
				return nil, err
			}

			query[key.GetKey()] = x
		case URLQueryKeyType[float32]:
			x, err := strconv.ParseFloat(queryValue, 32)
			if err != nil {
				return nil, err
			}

			query[key.GetKey()] = x
		case URLQueryKeyType[float64]:
			x, err := strconv.ParseFloat(queryValue, 64)
			if err != nil {
				return nil, err
			}

			query[key.GetKey()] = x
		case URLQueryKeyType[bool]:
			x, err := strconv.ParseBool(queryValue)
			if err != nil {
				return nil, err
			}

			query[key.GetKey()] = x
		default:
			return nil, errors.New("unsupported URL query key type")
		}
	}

	// A map of JSON-marshallable primitives cannot fail to marshal.
	queryBytes, _ := json.Marshal(query)

	var req T
	err := json.Unmarshal(queryBytes, &req)
	if err != nil {
		return nil, err
	}

	return &req, nil
}
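A usage sketch (not in the diff; SearchRequest and searchHandler are hypothetical) showing how the typed keys drive parsing: each URLQueryKey pins the Go type the matching query parameter is converted to before the map is round-tripped through JSON into the target struct:

	package httpio

	import "net/http"

	type SearchRequest struct {
		Title string `json:"title"`
		Limit int    `json:"limit"`
	}

	func (s SearchRequest) Validate() *HTTPError { return nil }

	// Handles e.g. GET /search?title=go&limit=5.
	func searchHandler(w http.ResponseWriter, r *http.Request) {
		req, err := ParseURLQuery[SearchRequest](
			r,
			URLQueryKey[string]("title"),
			URLQueryKey[int]("limit"),
		)
		if err != nil {
			RaiseOnlyStatusCode(w, http.StatusBadRequest)
			return
		}

		_ = req.Title // parsed as string
		_ = req.Limit // parsed via strconv.Atoi
	}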
17  api/setup.go  Normal file
@@ -0,0 +1,17 @@
package api

import (
	"log"
	"net/http"
	"scrap/api/article"

	"github.com/go-chi/chi"
)

func Setup() {
	r := chi.NewRouter()

	r.Get("/articles", article.ArticleQueryHandler)
	r.Get("/articles-download", article.ArticleDownloadHandler)

	// ListenAndServe always returns a non-nil error on shutdown; log it
	// instead of silently discarding it.
	if err := http.ListenAndServe(":8080", r); err != nil {
		log.Println(err.Error())
	}
}
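Not part of the commit: a minimal client sketch for exercising the route registered above, assuming the server from Setup is already listening on :8080:

	package main

	import (
		"io"
		"log"
		"net/http"
	)

	func main() {
		// Query articles whose titles start with "Go".
		resp, err := http.Get("http://localhost:8080/articles?title=Go")
		if err != nil {
			log.Fatal(err)
		}
		defer resp.Body.Close()

		body, err := io.ReadAll(resp.Body)
		if err != nil {
			log.Fatal(err)
		}

		log.Println(resp.Status, string(body))
	}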
@@ -2,9 +2,9 @@ package main
 
 import (
 	"log"
+	"scrap/api"
 	"scrap/internal/config"
 	"scrap/internal/db"
-	"scrap/internal/wikipediadl"
 )
 
 func main() {
@@ -13,8 +13,7 @@ func main() {
 	db.Setup()
 	defer db.Close()
 
-	if err := wikipediadl.FetchArticles(); err != nil {
-		log.Println(err.Error())
-	}
+	log.SetFlags(log.Lshortfile)
 
+	api.Setup()
 }
2  go.mod
@@ -3,6 +3,7 @@ module scrap
 go 1.24.4
 
 require (
+	github.com/go-chi/chi v1.5.5
 	github.com/gocolly/colly v1.2.0
 	github.com/mattn/go-sqlite3 v1.14.32
 )
@@ -16,6 +17,7 @@ require (
 	github.com/gobwas/glob v0.2.3 // indirect
 	github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
 	github.com/golang/protobuf v1.5.4 // indirect
+	github.com/google/uuid v1.6.0
 	github.com/kennygrant/sanitize v1.2.4 // indirect
 	github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
 	github.com/temoto/robotstxt v1.1.2 // indirect
4  go.sum
@@ -11,6 +11,8 @@ github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ=
 github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE=
+github.com/go-chi/chi v1.5.5/go.mod h1:C9JqLr3tIYjDOZpzn+BCuxY8z8vmca43EeMgyZt7irw=
 github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
 github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
 github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
@@ -26,6 +28,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
 github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
 github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
 github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
 github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=
11  internal/article/dto.go  Normal file
@@ -0,0 +1,11 @@
package article

type ArticleDTO struct {
	Uuid    string
	Title   string
	Content string
}

type ArticleQueryDTO struct {
	Title string
}
12  internal/article/error.go  Normal file
@@ -0,0 +1,12 @@
package article

import "errors"

var (
	ErrArticleDownloadFailed  = errors.New("article: download failed")
	ErrArticleQueryFailed     = errors.New("article: query failed")
	ErrArticleCreateFailed    = errors.New("article: create failed")
	ErrArticleDeleteAllFailed = errors.New("article: failed to delete all articles")

	ErrArticleTitleInvalidLength = errors.New("article: invalid article title length")
)
9  internal/article/irepository.go  Normal file
@@ -0,0 +1,9 @@
package article

import "database/sql"

type IArticleRepository interface {
	CreateArticle(tx *sql.Tx, data ArticleCreateModel) error
	GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error)
	DeleteAllArticles(tx *sql.Tx) error
}
6  internal/article/iservice.go  Normal file
@@ -0,0 +1,6 @@
package article

type IArticleService interface {
	DownloadArticles() error
	QueryArticles(ArticleQueryDTO) ([]ArticleDTO, error)
}
13  internal/article/model.go  Normal file
@@ -0,0 +1,13 @@
package article

type ArticleModel struct {
	Uuid    string
	Title   string
	Content string
}

type ArticleCreateModel struct {
	Uuid    string
	Title   string
	Content string
}
60  internal/article/repository.go  Normal file
@@ -0,0 +1,60 @@
package article

import "database/sql"

type ArticleRepository struct{}

func NewArticleRepository() IArticleRepository {
	return &ArticleRepository{}
}

func (ArticleRepository) CreateArticle(tx *sql.Tx, data ArticleCreateModel) error {
	query := `
		INSERT INTO articles(uuid, title, content)
		VALUES ($1, $2, $3);
	`

	_, err := tx.Exec(query, data.Uuid, data.Title, data.Content)
	return err
}

func (ArticleRepository) GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) {
	query := `
		SELECT uuid, title, content
		FROM articles
		WHERE title LIKE $1 || '%'
		LIMIT 10;
	`

	rows, err := tx.Query(query, title)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	articles := []ArticleModel{}
	for rows.Next() {
		var a ArticleModel

		err = rows.Scan(&a.Uuid, &a.Title, &a.Content)
		if err != nil {
			return nil, err
		}

		articles = append(articles, a)
	}

	return articles, rows.Err()
}

func (ArticleRepository) DeleteAllArticles(tx *sql.Tx) error {
	query := `DELETE FROM articles;`

	_, err := tx.Exec(query)
	return err
}
107  internal/article/service.go  Normal file
@@ -0,0 +1,107 @@
package article

import (
	"log"
	"scrap/internal/db"
	"scrap/internal/wikipediadl"

	"github.com/google/uuid"
)

type ArticleService struct {
	txRepo      db.ITxRepository
	articleRepo IArticleRepository
}

func NewArticleService(
	txRepo db.ITxRepository,
	articleRepo IArticleRepository,
) IArticleService {
	return &ArticleService{
		txRepo:      txRepo,
		articleRepo: articleRepo,
	}
}

func (a ArticleService) QueryArticles(data ArticleQueryDTO) ([]ArticleDTO, error) {
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		return nil, err
	}

	defer a.txRepo.RollbackOnError(tx, &err)

	articleTitleLength := len(data.Title)
	if articleTitleLength < 1 || articleTitleLength > 255 {
		// Assign to err so the deferred rollback closes the Tx.
		err = ErrArticleTitleInvalidLength
		return nil, err
	}

	articles, err := a.articleRepo.GetArticlesByTitle(tx, data.Title)
	if err != nil {
		log.Println(err.Error())
		return nil, ErrArticleQueryFailed
	}

	articlesOut := make([]ArticleDTO, 0, len(articles))
	for _, am := range articles {
		a := ArticleDTO{
			Uuid:    am.Uuid,
			Title:   am.Title,
			Content: am.Content,
		}

		articlesOut = append(articlesOut, a)
	}

	// The Tx is read-only, but it still has to be closed; commit it so it
	// is not left open on the success path.
	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return nil, db.ErrTxCommitFailed
	}

	return articlesOut, nil
}

func (a ArticleService) DownloadArticles() error {
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		return db.ErrTxBeginFailed
	}

	defer a.txRepo.RollbackOnError(tx, &err)

	if err = a.articleRepo.DeleteAllArticles(tx); err != nil {
		log.Println(err.Error())
		return ErrArticleDeleteAllFailed
	}

	articleBundles, err := wikipediadl.FetchArticleBundles()
	if err != nil {
		log.Println(err.Error())
		return ErrArticleDownloadFailed
	}

	for _, ab := range articleBundles {
		// Assign to the outer err (no :=) so the deferred rollback sees
		// failures from inside the loop.
		var articles []wikipediadl.WikiArticle
		articles, err = wikipediadl.ExtractArticles(ab)
		if err != nil {
			log.Println(err.Error())
			return ErrArticleDownloadFailed
		}

		for _, article := range articles {
			articleData := ArticleCreateModel{
				Uuid:    uuid.NewString(),
				Title:   article.Title,
				Content: article.Revision.Text,
			}

			// Skip articles that fail to insert instead of aborting the batch.
			if err = a.articleRepo.CreateArticle(tx, articleData); err != nil {
				log.Println(err.Error())
				continue
			}
		}
	}

	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return db.ErrTxCommitFailed
	}

	return nil
}
8  internal/db/error.go  Normal file
@@ -0,0 +1,8 @@
package db

import "errors"

var (
	ErrTxBeginFailed  = errors.New("tx: could not begin a Tx")
	ErrTxCommitFailed = errors.New("tx: could not commit the Tx")
)
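db.ITxRepository and db.NewTxRepository are referenced throughout but are not part of this diff. A minimal sketch of what they presumably look like, inferred only from the call sites in service.go; everything below is an assumption, not the actual implementation:

	package db

	import "database/sql"

	// Assumed shape; only the method set is pinned down by the callers.
	type ITxRepository interface {
		Begin() (*sql.Tx, error)
		Commit(tx *sql.Tx) error
		RollbackOnError(tx *sql.Tx, err *error)
	}

	type TxRepository struct{ db *sql.DB }

	func (t TxRepository) Begin() (*sql.Tx, error) { return t.db.Begin() }
	func (t TxRepository) Commit(tx *sql.Tx) error { return tx.Commit() }

	// Deferred with a pointer to the caller's outer err: whatever error is
	// live when the caller returns decides whether the Tx is rolled back.
	func (t TxRepository) RollbackOnError(tx *sql.Tx, err *error) {
		if *err != nil {
			tx.Rollback() // best effort; the original error takes precedence
		}
	}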
6  internal/wikipediadl/const.go  Normal file
@@ -0,0 +1,6 @@
package wikipediadl

const (
	WikipediaDumpDomain = "dumps.wikimedia.org"
	WikipediaDumpUrl    = "https://" + WikipediaDumpDomain + "/plwiki/latest/"
)
@@ -1,79 +0,0 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
)

type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

type Revision struct {
	Text string `xml:"text"`
}

func DownloadArticles() {
	url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"

	limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()

	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}

	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)

	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}

		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					log.Printf("Error decoding page: %v", err)
					continue
				}

				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				fmt.Println("--------------------\n")

				if *limiter > 0 && count >= *limiter {
					log.Printf("Reached limit of %d articles, stopping.", *limiter)
					return
				}
			}
		}
	}

	log.Printf("Done. Total articles processed: %d", count)
}
@@ -3,5 +3,6 @@ package wikipediadl
import "errors"

var (
	ErrArticleFetchFailed       = errors.New("wikipediadl: failed to fetch articles")
	ErrArticleBundleFetchFailed = errors.New("wikipediadl: failed to fetch article bundles")
	ErrArticleDownloadFailed    = errors.New("wikipediadl: failed to extract articles")
)
75  internal/wikipediadl/extractarticles.go  Normal file
@@ -0,0 +1,75 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"errors"
	"io"
	"log"
	"net/http"
)

type WikiArticle struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

type Revision struct {
	Text string `xml:"text"`
}

func ExtractArticles(bundle string) ([]WikiArticle, error) {
	url := WikipediaDumpUrl + bundle

	resp, err := http.Get(url)
	if err != nil {
		log.Println(err.Error())
		return nil, errors.New("wikipediadl: failed to load articles")
	}

	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return nil, errors.New("wikipediadl: bad response status")
	}

	bz2Reader := bzip2.NewReader(resp.Body)
	xmlDec := xml.NewDecoder(bz2Reader)

	count := 0

	articles := []WikiArticle{}
Loop:
	for {
		tok, err := xmlDec.Token()
		if err != nil {
			if err == io.EOF {
				break
			}

			return nil, errors.New("wikipediadl: XML token error")
		}

		switch se := tok.(type) {
		case xml.StartElement:
			if count == 2 { // XXX: remove later
				break Loop
			}

			if se.Name.Local != "page" {
				continue
			}

			var p WikiArticle
			if err := xmlDec.DecodeElement(&p, &se); err != nil {
				log.Println(err.Error())
				continue
			}

			articles = append(articles, p)
			count++ // XXX: remove later
		}
	}

	return articles, nil
}
@@ -1,7 +1,6 @@
 package wikipediadl
 
 import (
-	"fmt"
 	"log"
 	"strings"
 
@@ -13,12 +12,17 @@ const (
 	DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/"
 )
 
+func FetchArticleBundles() ([]string, error) {
+	scraper := getScraper()
+
+	articles := getAllArticles(scraper)
+	return articles, nil
+}
+
 func getScraper() *colly.Collector {
-	s := colly.NewCollector(
+	return colly.NewCollector(
 		colly.AllowedDomains(DumpDomain),
 	)
-
-	return s
 }
 
 func getAllArticles(s *colly.Collector) []string {
@@ -58,14 +62,3 @@ func isValidArticle(a string) bool {
 	articleIndex := article[0]
 	return articleIndex >= 48 && articleIndex <= 57
 }
-
-func FetchArticles() error {
-	scraper := getScraper()
-
-	articles := getAllArticles(scraper)
-	for _, a := range articles {
-		fmt.Println(a)
-	}
-
-	return nil
-}
@@ -1,5 +1,5 @@
 CREATE TABLE IF NOT EXISTS articles(
-	uuid CHAR(36),
+	uuid CHAR(36) PRIMARY KEY,
 	title VARCHAR(255),
 	content TEXT
 )