dev #1

Merged
jumpiniasty merged 5 commits from dev into main 2025-10-05 08:09:47 +00:00
26 changed files with 636 additions and 100 deletions
Showing only changes of commit 6df63dc4c1 - Show all commits

78
api/article/handler.go Normal file
View File

@@ -0,0 +1,78 @@
package article
import (
"net/http"
"scrap/api/httpio"
"scrap/internal/article"
"scrap/internal/db"
)
// ArticleDownloadHandler triggers a full re-download of Wikipedia articles
// into the database. On failure it responds with 500; on success it writes
// no body, so net/http defaults to 200 OK.
func ArticleDownloadHandler(w http.ResponseWriter, r *http.Request) {
	dbInstance := db.GetInstance()
	txRepo := db.NewTxRepository(dbInstance)
	articleRepo := article.NewArticleRepository()
	service := article.NewArticleService(txRepo, articleRepo)

	if err := service.DownloadArticles(); err != nil {
		// No domain-specific errors are mapped for this endpoint yet, so the
		// original switch had only a default case; simplified to a direct 500.
		httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		return
	}
}
// ArticleQueryHandler serves article queries. It reads the "title" URL query
// parameter, validates it, asks the service for matching articles and writes
// them back as a JSON envelope of the form {"articles": [...]}.
func ArticleQueryHandler(w http.ResponseWriter, r *http.Request) {
	req, err := httpio.ParseURLQuery[ArticleQueryRequest](
		r,
		httpio.URLQueryKey[string]("title"),
	)
	if err != nil {
		httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		return
	}

	if validationErr := req.Validate(); validationErr != nil {
		validationErr.Raise(w)
		return
	}

	service := article.NewArticleService(
		db.NewTxRepository(db.GetInstance()),
		article.NewArticleRepository(),
	)

	found, err := service.QueryArticles(article.ArticleQueryDTO{Title: req.Title})
	if err != nil {
		switch err {
		case article.ErrArticleTitleInvalidLength:
			ErrHttpArticleTitleInvalidLength.Raise(w)
		default:
			httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		}
		return
	}

	responses := make([]ArticleResponse, 0, len(found))
	for _, item := range found {
		responses = append(responses, ArticleResponse{
			Uuid:    item.Uuid,
			Title:   item.Title,
			Content: item.Content,
		})
	}

	if err = ArticleQueryResponse(responses).Return(w, http.StatusOK); err != nil {
		httpio.RaiseOnlyStatusCode(w, http.StatusInternalServerError)
		return
	}
}

14
api/article/httperror.go Normal file
View File

@@ -0,0 +1,14 @@
package article
import (
"net/http"
"scrap/api/httpio"
)
// HTTP-level errors raised by the article handlers.
var (
	// ErrHttpArticleTitleInvalidLength is sent as a 400 response when the
	// requested article title has an invalid length.
	ErrHttpArticleTitleInvalidLength = httpio.HTTPError{
		StatusCode: http.StatusBadRequest,
		ErrorCode:  "ARTICLE_TITLE_LENGTH",
		Message:    "Invalid title length.",
	}
)

16
api/article/request.go Normal file
View File

@@ -0,0 +1,16 @@
package article
import "scrap/api/httpio"
// ArticleQueryRequest carries the query parameters accepted by the article
// query endpoint.
type ArticleQueryRequest struct {
	Title string `json:"title"`
}

// Validate checks that the title is between 1 and 255 bytes long and returns
// the matching HTTP error when it is not, or nil when the request is valid.
func (a ArticleQueryRequest) Validate() *httpio.HTTPError {
	if n := len(a.Title); n < 1 || n > 255 {
		return &ErrHttpArticleTitleInvalidLength
	}
	return nil
}

15
api/article/response.go Normal file
View File

@@ -0,0 +1,15 @@
package article
import "scrap/api/httpio"
// ArticleResponse is the JSON representation of a single article.
type ArticleResponse struct {
	Uuid    string `json:"uuid"`
	Title   string `json:"title"`
	Content string `json:"content"`
}

// ArticleQueryResponse wraps a list of articles in the response envelope
// used by the query endpoint: {"articles": [...]}.
func ArticleQueryResponse(articles []ArticleResponse) httpio.ResponseIO {
	payload := httpio.ResponseIO{}
	payload["articles"] = articles
	return payload
}

24
api/httpio/httperror.go Normal file
View File

@@ -0,0 +1,24 @@
package httpio
import (
"encoding/json"
"net/http"
)
// HTTPError is a JSON-serializable API error. StatusCode is sent as the HTTP
// status and excluded from the body; ErrorCode and Message form the body.
type HTTPError struct {
	StatusCode int    `json:"-"`
	ErrorCode  string `json:"error-code"`
	Message    string `json:"message"`
}

// Raise writes the error to w as an application/json response carrying the
// error's status code.
func (h HTTPError) Raise(w http.ResponseWriter) {
	jsonBytes, err := json.Marshal(h)
	if err != nil {
		// Marshalling a struct of plain strings should never fail, but if it
		// does, fall back to a bare status code instead of silently writing
		// an empty 200 body (the original ignored this error).
		RaiseOnlyStatusCode(w, h.StatusCode)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(h.StatusCode)
	w.Write(jsonBytes) //nolint:errcheck // nothing sensible to do on write failure
}

// RaiseOnlyStatusCode responds with the given status code and an empty body.
func RaiseOnlyStatusCode(w http.ResponseWriter, code int) {
	http.Error(w, "", code)
}

38
api/httpio/request.go Normal file
View File

@@ -0,0 +1,38 @@
package httpio
import (
"encoding/json"
"errors"
"io"
"log"
"net/http"
)
// IRequestIO is implemented by request DTOs that can validate themselves
// after being parsed from a request body or URL query.
type IRequestIO interface {
	// Validate checks the received request and returns a descriptive HTTP
	// error when it is invalid, or nil when it is valid.
	Validate() *HTTPError
}
// ParseRequestBody reads the request body and decodes it as JSON into a
// value of T. Read and decode failures are logged; malformed or mismatched
// JSON is reported as a generic "invalid JSON format" error.
func ParseRequestBody[T IRequestIO](r *http.Request) (*T, error) {
	requestBytes, err := io.ReadAll(r.Body)
	if err != nil {
		log.Println(err.Error())
		return nil, err
	}
	// json.Unmarshal already rejects malformed JSON, so the original's
	// separate json.Valid pre-pass parsed the payload twice for no benefit.
	var req T
	if err = json.Unmarshal(requestBytes, &req); err != nil {
		log.Println(err.Error())
		return nil, errors.New("invalid JSON format")
	}
	return &req, nil
}

21
api/httpio/response.go Normal file
View File

@@ -0,0 +1,21 @@
package httpio
import (
"encoding/json"
"net/http"
)
// ResponseIO is a generic JSON response payload keyed by field name.
type ResponseIO map[string]any

// Return marshals the payload and writes it to w as an application/json
// response with the given status code. It returns a non-nil error when the
// payload cannot be marshalled or the body cannot be written.
func (r ResponseIO) Return(w http.ResponseWriter, statusCode int) error {
	jsonBytes, err := json.Marshal(r)
	if err != nil {
		return err
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(statusCode)
	// Surface write failures to the caller instead of dropping them
	// (the original discarded this error).
	_, err = w.Write(jsonBytes)
	return err
}

86
api/httpio/urlquery.go Normal file
View File

@@ -0,0 +1,86 @@
package httpio
import (
"encoding/json"
"errors"
"net/http"
"strconv"
)
// URLQueryValueType enumerates the Go types a URL query parameter may be
// parsed into.
type URLQueryValueType interface {
	string | int | float32 | float64 | bool
}

// iURLQueryKeyType erases the type parameter of URLQueryKeyType so keys of
// different value types can travel in a single variadic argument list.
type iURLQueryKeyType interface {
	GetKey() string
}

// URLQueryKeyType binds a query-parameter name to the value type T it should
// be parsed as; the blank field exists only to record T.
type URLQueryKeyType[T URLQueryValueType] struct {
	Key string
	_   T
}

// GetKey returns the query-parameter name.
func (u URLQueryKeyType[T]) GetKey() string {
	return u.Key
}

// URLQueryKey builds a typed query-parameter key for ParseURLQuery.
func URLQueryKey[T URLQueryValueType](key string) iURLQueryKeyType {
	return URLQueryKeyType[T]{Key: key}
}
// ParseURLQuery extracts the given keys from the request's URL query,
// converts each value to its declared type, and decodes the result into a
// value of T via a JSON round-trip.
//
// Keys absent from the query (or present with an empty value) are skipped,
// so the corresponding fields of T keep their zero values. A conversion
// failure for int/float/bool values aborts parsing with the strconv error.
func ParseURLQuery[T IRequestIO](r *http.Request, keys ...iURLQueryKeyType) (*T, error) {
	// Parse the raw query string once, not once per key as before.
	values := r.URL.Query()
	query := make(map[string]any, len(keys))
	for _, key := range keys {
		queryValue := values.Get(key.GetKey())
		if queryValue == "" {
			continue
		}
		switch key.(type) {
		case URLQueryKeyType[string]:
			query[key.GetKey()] = queryValue
		case URLQueryKeyType[int]:
			x, err := strconv.Atoi(queryValue)
			if err != nil {
				return nil, err
			}
			query[key.GetKey()] = x
		case URLQueryKeyType[float32]:
			// ParseFloat always returns float64; bitSize 32 only bounds the
			// precision. The JSON re-encoding below handles it either way.
			x, err := strconv.ParseFloat(queryValue, 32)
			if err != nil {
				return nil, err
			}
			query[key.GetKey()] = x
		case URLQueryKeyType[float64]:
			x, err := strconv.ParseFloat(queryValue, 64)
			if err != nil {
				return nil, err
			}
			query[key.GetKey()] = x
		case URLQueryKeyType[bool]:
			x, err := strconv.ParseBool(queryValue)
			if err != nil {
				return nil, err
			}
			query[key.GetKey()] = x
		default:
			return nil, errors.New("unsupported URL query key type")
		}
	}
	// The map only holds JSON-marshalable primitives, but do not silently
	// discard a marshal failure (the original ignored this error).
	queryBytes, err := json.Marshal(query)
	if err != nil {
		return nil, err
	}
	var req T
	if err := json.Unmarshal(queryBytes, &req); err != nil {
		return nil, err
	}
	return &req, nil
}

17
api/setup.go Normal file
View File

@@ -0,0 +1,17 @@
package api
import (
	"log"
	"net/http"

	"scrap/api/article"

	"github.com/go-chi/chi"
)
// Setup builds the HTTP router, registers the article endpoints and starts
// the server on :8080. It blocks for the lifetime of the server and exits
// the process if the listener fails.
func Setup() {
	r := chi.NewRouter()
	r.Get("/articles", article.ArticleQueryHandler)
	r.Get("/articles-download", article.ArticleDownloadHandler)
	// ListenAndServe only returns on failure; the original discarded that
	// error, hiding e.g. a port already in use.
	log.Fatal(http.ListenAndServe(":8080", r))
}

View File

@@ -2,9 +2,9 @@ package main
import (
"log"
"scrap/api"
"scrap/internal/config"
"scrap/internal/db"
"scrap/internal/wikipediadl"
)
func main() {
@@ -13,8 +13,7 @@ func main() {
db.Setup()
defer db.Close()
if err := wikipediadl.FetchArticles(); err != nil {
log.Println(err.Error())
}
log.SetFlags(log.Lshortfile)
api.Setup()
}

2
go.mod
View File

@@ -3,6 +3,7 @@ module scrap
go 1.24.4
require (
github.com/go-chi/chi v1.5.5
github.com/gocolly/colly v1.2.0
github.com/mattn/go-sqlite3 v1.14.32
)
@@ -16,6 +17,7 @@ require (
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/uuid v1.6.0
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/temoto/robotstxt v1.1.2 // indirect

4
go.sum
View File

@@ -11,6 +11,8 @@ github.com/antchfx/xpath v1.3.5 h1:PqbXLC3TkfeZyakF5eeh3NTWEbYl4VHNVeufANzDbKQ=
github.com/antchfx/xpath v1.3.5/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-chi/chi v1.5.5 h1:vOB/HbEMt9QqBqErz07QehcOKHaWFtuj87tTDVz2qXE=
github.com/go-chi/chi v1.5.5/go.mod h1:C9JqLr3tIYjDOZpzn+BCuxY8z8vmca43EeMgyZt7irw=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
github.com/gocolly/colly v1.2.0 h1:qRz9YAn8FIH0qzgNUw+HT9UN7wm1oF9OBAilwEWpyrI=
@@ -26,6 +28,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
github.com/mattn/go-sqlite3 v1.14.32 h1:JD12Ag3oLy1zQA+BNn74xRgaBbdhbNIDYvQUEuuErjs=

11
internal/article/dto.go Normal file
View File

@@ -0,0 +1,11 @@
package article
// ArticleDTO is the service-layer representation of a stored article.
type ArticleDTO struct {
	Uuid    string
	Title   string
	Content string
}

// ArticleQueryDTO carries the filters accepted by
// IArticleService.QueryArticles.
type ArticleQueryDTO struct {
	Title string
}

12
internal/article/error.go Normal file
View File

@@ -0,0 +1,12 @@
package article
import "errors"
// Sentinel errors returned by the article service; handlers map these to
// HTTP responses by identity, so only the messages below may change freely.
var (
	ErrArticleDownloadFailed  = errors.New("article: download failed")
	ErrArticleQueryFailed     = errors.New("article: article query failed")
	ErrArticleCreateFailed    = errors.New("article: create failed")
	ErrArticleDeleteAllFailed = errors.New("article: failed to delete all articles")
	// The message previously said "invalid article length"; this error is
	// returned for an out-of-range article *title* length.
	ErrArticleTitleInvalidLength = errors.New("article: invalid article title length")
)

View File

@@ -0,0 +1,9 @@
package article
import "database/sql"
// IArticleRepository abstracts article persistence. All methods run their
// statements inside the caller-supplied transaction.
type IArticleRepository interface {
	// CreateArticle inserts a single article row.
	CreateArticle(tx *sql.Tx, data ArticleCreateModel) error
	// GetArticlesByTitle returns articles whose title starts with title.
	GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error)
	// DeleteAllArticles removes every article row.
	DeleteAllArticles(tx *sql.Tx) error
}

View File

@@ -0,0 +1,6 @@
package article
// IArticleService is the application-facing article API.
type IArticleService interface {
	// DownloadArticles re-populates the article store from the Wikipedia dump.
	DownloadArticles() error
	// QueryArticles returns the articles matching the given filters.
	QueryArticles(ArticleQueryDTO) ([]ArticleDTO, error)
}

13
internal/article/model.go Normal file
View File

@@ -0,0 +1,13 @@
package article
// ArticleModel mirrors a row of the articles table.
type ArticleModel struct {
	Uuid    string
	Title   string
	Content string
}

// ArticleCreateModel holds the column values for inserting a new article.
// It is currently structurally identical to ArticleModel but kept as a
// separate type for the write path.
type ArticleCreateModel struct {
	Uuid    string
	Title   string
	Content string
}

View File

@@ -0,0 +1,60 @@
package article
import (
"database/sql"
"fmt"
)
// ArticleRepository is the SQL implementation of IArticleRepository. It is
// stateless; every method operates on the transaction it is given.
type ArticleRepository struct{}

// NewArticleRepository returns a ready-to-use article repository.
func NewArticleRepository() IArticleRepository {
	return &ArticleRepository{}
}
// CreateArticle inserts one article row inside the supplied transaction.
func (ArticleRepository) CreateArticle(tx *sql.Tx, data ArticleCreateModel) error {
	const query = `
INSERT INTO articles(uuid, title, content)
VALUES ($1, $2, $3);
`
	_, err := tx.Exec(query, data.Uuid, data.Title, data.Content)
	return err
}
// GetArticlesByTitle returns up to 10 articles whose title starts with the
// given prefix, inside the supplied transaction.
func (ArticleRepository) GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) {
	// Leftover debug fmt.Println calls removed; errors are now wrapped with
	// context instead.
	query := `
SELECT uuid, title, content
FROM articles
WHERE title LIKE $1 || '%'
LIMIT 10;
`
	rows, err := tx.Query(query, title)
	if err != nil {
		return nil, fmt.Errorf("querying articles by title: %w", err)
	}
	// Always release the rows, including on early scan errors (the original
	// never closed them).
	defer rows.Close()
	articles := []ArticleModel{}
	for rows.Next() {
		var a ArticleModel
		if err := rows.Scan(&a.Uuid, &a.Title, &a.Content); err != nil {
			return nil, fmt.Errorf("scanning article row: %w", err)
		}
		articles = append(articles, a)
	}
	// rows.Err reports iteration errors that rows.Next swallows.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("iterating article rows: %w", err)
	}
	return articles, nil
}
// DeleteAllArticles removes every row from the articles table inside the
// supplied transaction.
func (ArticleRepository) DeleteAllArticles(tx *sql.Tx) error {
	const query = `DELETE FROM articles;`
	_, err := tx.Exec(query)
	return err
}

107
internal/article/service.go Normal file
View File

@@ -0,0 +1,107 @@
package article
import (
"log"
"scrap/internal/db"
"scrap/internal/wikipediadl"
"github.com/google/uuid"
)
// ArticleService implements IArticleService on top of a transaction
// repository and an article repository.
type ArticleService struct {
	txRepo      db.ITxRepository
	articleRepo IArticleRepository
}

// NewArticleService wires the service with its transaction and article
// repositories.
func NewArticleService(
	txRepo db.ITxRepository,
	articleRepo IArticleRepository,
) IArticleService {
	return &ArticleService{
		txRepo:      txRepo,
		articleRepo: articleRepo,
	}
}
// QueryArticles returns the articles whose title matches data.Title. The
// title must be 1-255 bytes long; otherwise ErrArticleTitleInvalidLength is
// returned without touching the database.
func (a ArticleService) QueryArticles(data ArticleQueryDTO) ([]ArticleDTO, error) {
	// Validate before opening a transaction: the original checked after
	// Begin and returned without assigning err, so the deferred
	// RollbackOnError saw nil and the transaction was never closed.
	titleLen := len(data.Title)
	if titleLen < 1 || titleLen > 255 {
		return nil, ErrArticleTitleInvalidLength
	}
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		// Consistent with DownloadArticles, which maps this failure to the
		// db-level sentinel.
		return nil, db.ErrTxBeginFailed
	}
	defer a.txRepo.RollbackOnError(tx, &err)
	articles, err := a.articleRepo.GetArticlesByTitle(tx, data.Title)
	if err != nil {
		log.Println(err.Error())
		return nil, ErrArticleQueryFailed
	}
	// Close the read-only transaction; previously it was neither committed
	// nor rolled back on the success path.
	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return nil, db.ErrTxCommitFailed
	}
	articlesOut := make([]ArticleDTO, 0, len(articles))
	for _, am := range articles {
		articlesOut = append(articlesOut, ArticleDTO{
			Uuid:    am.Uuid,
			Title:   am.Title,
			Content: am.Content,
		})
	}
	return articlesOut, nil
}
// DownloadArticles wipes the article table and repopulates it from the
// latest Wikipedia dump bundles. Individual article insert failures are
// logged and skipped (best effort); bundle-level failures abort the whole
// download and the deferred rollback undoes the partial import.
func (a ArticleService) DownloadArticles() error {
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		return db.ErrTxBeginFailed
	}
	defer a.txRepo.RollbackOnError(tx, &err)
	if err = a.articleRepo.DeleteAllArticles(tx); err != nil {
		log.Println(err.Error())
		return ErrArticleDeleteAllFailed
	}
	articleBundles, err := wikipediadl.FetchArticleBundles()
	if err != nil {
		log.Println(err.Error())
		return ErrArticleDownloadFailed
	}
	for _, ab := range articleBundles {
		// Assign to the outer err so the deferred RollbackOnError sees
		// bundle failures; the original's `articles, err :=` shadowed err
		// inside the loop, defeating the rollback.
		var articles []wikipediadl.WikiArticle
		articles, err = wikipediadl.ExtractArticles(ab)
		if err != nil {
			log.Println(err.Error())
			return ErrArticleDownloadFailed
		}
		for _, article := range articles {
			articleData := ArticleCreateModel{
				Uuid:    uuid.NewString(),
				Title:   article.Title,
				Content: article.Revision.Text,
			}
			// Best effort: log and skip the one article that failed to
			// insert. (Leftover debug text removed from this log call.)
			if err = a.articleRepo.CreateArticle(tx, articleData); err != nil {
				log.Println(err.Error())
				continue
			}
		}
	}
	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return db.ErrTxCommitFailed
	}
	return nil
}

8
internal/db/error.go Normal file
View File

@@ -0,0 +1,8 @@
package db
import "errors"
// Transaction-level sentinel errors surfaced to ITxRepository users.
var (
	// ErrTxBeginFailed is returned when a new transaction cannot be opened.
	ErrTxBeginFailed = errors.New("tx: could not begin a Tx")
	// ErrTxCommitFailed is returned when committing a transaction fails.
	ErrTxCommitFailed = errors.New("tx: could not commit the Tx")
)

View File

@@ -0,0 +1,6 @@
package wikipediadl
// Endpoints of the official Wikipedia dump site for the Polish-language
// (plwiki) snapshot.
const (
	// WikipediaDumpDomain is the host serving the dump files.
	WikipediaDumpDomain = "dumps.wikimedia.org"
	// WikipediaDumpUrl is the directory of the latest plwiki dump; bundle
	// file names are appended to it.
	WikipediaDumpUrl = "https://" + WikipediaDumpDomain + "/plwiki/latest/"
)

View File

@@ -1,79 +0,0 @@
package wikipediadl
import (
"compress/bzip2"
"encoding/xml"
"flag"
"fmt"
"io"
"log"
"net/http"
)
// Page is the XML shape of a <page> element in a Wikipedia dump.
// (Part of the legacy downloader file removed by this commit.)
type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision carries the wikitext body of a page revision.
type Revision struct {
	Text string `xml:"text"`
}
// DownloadArticles streams one hard-coded bz2 dump chunk, decodes its
// <page> elements and prints article titles up to the -limit flag.
// (Legacy entry point removed by this commit.)
func DownloadArticles() {
	url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"
	// NOTE(review): registering a flag inside a function panics if called
	// twice, and flag.Parse here clobbers the program's own flag handling.
	limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()
	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		// Include a bounded sample of the body in the failure message.
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}
	// The dump is bz2-compressed XML; decode it as a stream.
	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)
	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}
		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					// Skip undecodable pages instead of aborting.
					log.Printf("Error decoding page: %v", err)
					continue
				}
				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				fmt.Println("--------------------\n")
				if *limiter > 0 && count >= *limiter {
					log.Printf("Reached limit of %d articles, stopping.", *limiter)
					return
				}
			}
		}
	}
	log.Printf("Done. Total articles processed: %d", count)
}

View File

@@ -3,5 +3,6 @@ package wikipediadl
import "errors"
// Sentinel errors for the Wikipedia download pipeline.
var (
	ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles")
	ErrArticleBundleFetchFailed = errors.New("wikipediadl: failed to fetch article bundles")
	// NOTE(review): the name says "download" while the message says
	// "extract" — confirm which is intended.
	ErrArticleDownloadFailed = errors.New("wikipediadl: failed to extract articles")
)

View File

@@ -0,0 +1,75 @@
package wikipediadl
import (
"compress/bzip2"
"encoding/xml"
"errors"
"io"
"log"
"net/http"
)
// WikiArticle is the XML shape of a <page> element in a Wikipedia dump.
type WikiArticle struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

// Revision carries the wikitext body of a page revision.
type Revision struct {
	Text string `xml:"text"`
}
// ExtractArticles downloads the named bz2 dump bundle from the Wikipedia
// dump site and decodes its <page> elements into WikiArticles. The response
// is streamed through bzip2 and XML decoders, so the bundle is never fully
// buffered in memory.
func ExtractArticles(bundle string) ([]WikiArticle, error) {
	url := WikipediaDumpUrl + bundle
	resp, err := http.Get(url)
	if err != nil {
		log.Println(err.Error())
		// Fixed message typo: was "failed load articles".
		return nil, errors.New("wikipediadl: failed to load articles")
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New("wikipediadl: bad response status")
	}
	bz2Reader := bzip2.NewReader(resp.Body)
	xmlDec := xml.NewDecoder(bz2Reader)
	count := 0
	articles := []WikiArticle{}
Loop:
	for {
		tok, err := xmlDec.Token()
		if err != nil {
			if err == io.EOF {
				break
			}
			// Log the underlying decoder error before returning the
			// generic one, so the cause is not lost.
			log.Println(err.Error())
			return nil, errors.New("XML token error")
		}
		switch se := tok.(type) {
		case xml.StartElement:
			// XXX: development cap — only the first two pages are kept.
			// Remove (or make configurable) before production use.
			if count == 2 {
				break Loop
			}
			if se.Name.Local != "page" {
				continue
			}
			var p WikiArticle
			if err := xmlDec.DecodeElement(&p, &se); err != nil {
				// Skip undecodable pages instead of aborting the bundle.
				log.Println(err.Error())
				continue
			}
			articles = append(articles, p)
			count++ // XXX: remove together with the cap above
		}
	}
	return articles, nil
}

View File

@@ -1,7 +1,6 @@
package wikipediadl
import (
"fmt"
"log"
"strings"
@@ -13,12 +12,17 @@ const (
DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/"
)
func FetchArticleBundles() ([]string, error) {
scraper := getScraper()
articles := getAllArticles(scraper)
return articles, nil
}
func getScraper() *colly.Collector {
s := colly.NewCollector(
return colly.NewCollector(
colly.AllowedDomains(DumpDomain),
)
return s
}
func getAllArticles(s *colly.Collector) []string {
@@ -58,14 +62,3 @@ func isValidArticle(a string) bool {
articleIndex := article[0]
return articleIndex >= 48 && articleIndex <= 57
}
func FetchArticles() error {
scraper := getScraper()
articles := getAllArticles(scraper)
for _, a := range articles {
fmt.Println(a)
}
return nil
}

View File

@@ -1,5 +1,5 @@
CREATE TABLE IF NOT EXISTS articles(
uuid CHAR(36),
uuid CHAR(36) PRIMARY KEY,
title VARCHAR(255),
content TEXT
)