Added: downloading and returning Wikipedia articles

Oliwier Adamczyk
2025-10-04 23:14:41 +02:00
parent f542f01b49
commit 6df63dc4c1
26 changed files with 636 additions and 100 deletions

internal/article/dto.go

@@ -0,0 +1,11 @@
package article

type ArticleDTO struct {
	Uuid    string
	Title   string
	Content string
}

type ArticleQueryDTO struct {
	Title string
}

internal/article/error.go

@@ -0,0 +1,12 @@
package article

import "errors"

var (
	ErrArticleDownloadFailed     = errors.New("article: download failed")
	ErrArticleQueryFailed        = errors.New("article: query failed")
	ErrArticleCreateFailed       = errors.New("article: create failed")
	ErrArticleDeleteAllFailed    = errors.New("article: failed to delete all articles")
	ErrArticleTitleInvalidLength = errors.New("article: invalid title length")
)


@@ -0,0 +1,9 @@
package article

import "database/sql"

type IArticleRepository interface {
	CreateArticle(tx *sql.Tx, data ArticleCreateModel) error
	GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error)
	DeleteAllArticles(tx *sql.Tx) error
}


@@ -0,0 +1,6 @@
package article

type IArticleService interface {
	DownloadArticles() error
	QueryArticles(ArticleQueryDTO) ([]ArticleDTO, error)
}

internal/article/model.go

@@ -0,0 +1,13 @@
package article

type ArticleModel struct {
	Uuid    string
	Title   string
	Content string
}

type ArticleCreateModel struct {
	Uuid    string
	Title   string
	Content string
}


@@ -0,0 +1,60 @@
package article

import "database/sql"

type ArticleRepository struct{}

func NewArticleRepository() IArticleRepository {
	return &ArticleRepository{}
}

func (ArticleRepository) CreateArticle(tx *sql.Tx, data ArticleCreateModel) error {
	query := `
		INSERT INTO articles(uuid, title, content)
		VALUES ($1, $2, $3);
	`
	_, err := tx.Exec(query, data.Uuid, data.Title, data.Content)
	return err
}

func (ArticleRepository) GetArticlesByTitle(tx *sql.Tx, title string) ([]ArticleModel, error) {
	query := `
		SELECT uuid, title, content
		FROM articles
		WHERE title LIKE $1 || '%'
		LIMIT 10;
	`
	rows, err := tx.Query(query, title)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	articles := []ArticleModel{}
	for rows.Next() {
		var a ArticleModel
		if err := rows.Scan(&a.Uuid, &a.Title, &a.Content); err != nil {
			return nil, err
		}
		articles = append(articles, a)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return articles, nil
}

func (ArticleRepository) DeleteAllArticles(tx *sql.Tx) error {
	query := `DELETE FROM articles;`
	_, err := tx.Exec(query)
	return err
}
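
Note: the $1 placeholders and the || concatenation in these queries suggest PostgreSQL. The migration that creates the articles table is not part of this commit; a hypothetical schema consistent with the queries above might look like:

// Hypothetical schema inferred from the queries above; the actual
// migration is not included in this commit.
const createArticlesTable = `
	CREATE TABLE IF NOT EXISTS articles (
		uuid    TEXT PRIMARY KEY,
		title   TEXT NOT NULL,
		content TEXT NOT NULL
	);
`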

internal/article/service.go

@@ -0,0 +1,107 @@
package article

import (
	"log"

	"scrap/internal/db"
	"scrap/internal/wikipediadl"

	"github.com/google/uuid"
)

type ArticleService struct {
	txRepo      db.ITxRepository
	articleRepo IArticleRepository
}

func NewArticleService(
	txRepo db.ITxRepository,
	articleRepo IArticleRepository,
) IArticleService {
	return &ArticleService{
		txRepo:      txRepo,
		articleRepo: articleRepo,
	}
}

func (a ArticleService) QueryArticles(data ArticleQueryDTO) ([]ArticleDTO, error) {
	// Validate input before opening a transaction.
	titleLength := len(data.Title)
	if titleLength < 1 || titleLength > 255 {
		return nil, ErrArticleTitleInvalidLength
	}
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		return nil, db.ErrTxBeginFailed
	}
	defer a.txRepo.RollbackOnError(tx, &err)
	articles, err := a.articleRepo.GetArticlesByTitle(tx, data.Title)
	if err != nil {
		log.Println(err.Error())
		return nil, ErrArticleQueryFailed
	}
	// Commit even though this is read-only, so the transaction is released.
	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return nil, db.ErrTxCommitFailed
	}
	articlesOut := make([]ArticleDTO, 0, len(articles))
	for _, am := range articles {
		articlesOut = append(articlesOut, ArticleDTO{
			Uuid:    am.Uuid,
			Title:   am.Title,
			Content: am.Content,
		})
	}
	return articlesOut, nil
}

func (a ArticleService) DownloadArticles() error {
	tx, err := a.txRepo.Begin()
	if err != nil {
		log.Println(err.Error())
		return db.ErrTxBeginFailed
	}
	defer a.txRepo.RollbackOnError(tx, &err)
	if err = a.articleRepo.DeleteAllArticles(tx); err != nil {
		log.Println(err.Error())
		return ErrArticleDeleteAllFailed
	}
	articleBundles, err := wikipediadl.FetchArticleBundles()
	if err != nil {
		log.Println(err.Error())
		return ErrArticleDownloadFailed
	}
	for _, ab := range articleBundles {
		var articles []wikipediadl.WikiArticle
		// Assign to the outer err so the deferred rollback sees a failure here.
		articles, err = wikipediadl.ExtractArticles(ab)
		if err != nil {
			log.Println(err.Error())
			return ErrArticleDownloadFailed
		}
		for _, article := range articles {
			articleData := ArticleCreateModel{
				Uuid:    uuid.NewString(),
				Title:   article.Title,
				Content: article.Revision.Text,
			}
			// Skip articles that fail to insert instead of aborting the import.
			if insertErr := a.articleRepo.CreateArticle(tx, articleData); insertErr != nil {
				log.Println(insertErr.Error())
				continue
			}
		}
	}
	if err = a.txRepo.Commit(tx); err != nil {
		log.Println(err.Error())
		return db.ErrTxCommitFailed
	}
	return nil
}
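
For reference, a composition root wiring this service together could look like the sketch below. db.NewTxRepository, the postgres driver, and the DSN are assumptions; none of them appears in this diff (see the ITxRepository sketch after internal/db/error.go):

package main

import (
	"database/sql"
	"log"

	"scrap/internal/article"
	"scrap/internal/db"

	_ "github.com/lib/pq"
)

func main() {
	// Hypothetical wiring; constructor, driver, and DSN are not part of this commit.
	sqlDB, err := sql.Open("postgres", "postgres://localhost/scrap?sslmode=disable")
	if err != nil {
		log.Fatalln(err)
	}
	svc := article.NewArticleService(db.NewTxRepository(sqlDB), article.NewArticleRepository())
	if err := svc.DownloadArticles(); err != nil {
		log.Fatalln(err)
	}
}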

internal/db/error.go

@@ -0,0 +1,8 @@
package db

import "errors"

var (
	ErrTxBeginFailed  = errors.New("tx: could not begin a Tx")
	ErrTxCommitFailed = errors.New("tx: could not commit the Tx")
)
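
service.go depends on a db.ITxRepository that is not shown in this diff. A minimal sketch inferred purely from its call sites (Begin, Commit, and a deferred RollbackOnError that receives a pointer to the caller's error so it can observe the value at return time); the real implementation may differ:

// Sketch only: inferred from call sites in internal/article/service.go,
// not from this commit's actual db package.
type ITxRepository interface {
	Begin() (*sql.Tx, error)
	Commit(tx *sql.Tx) error
	RollbackOnError(tx *sql.Tx, err *error)
}

type TxRepository struct {
	db *sql.DB
}

func NewTxRepository(db *sql.DB) ITxRepository {
	return TxRepository{db: db}
}

func (r TxRepository) Begin() (*sql.Tx, error) { return r.db.Begin() }

func (r TxRepository) Commit(tx *sql.Tx) error { return tx.Commit() }

// RollbackOnError rolls back only when the caller is returning a non-nil
// error; sql.ErrTxDone is ignored because the tx may already be committed.
func (r TxRepository) RollbackOnError(tx *sql.Tx, err *error) {
	if err == nil || *err == nil {
		return
	}
	if rbErr := tx.Rollback(); rbErr != nil && rbErr != sql.ErrTxDone {
		log.Println(rbErr.Error())
	}
}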


@@ -0,0 +1,6 @@
package wikipediadl

const (
	WikipediaDumpDomain = "dumps.wikimedia.org"
	WikipediaDumpUrl    = "https://" + WikipediaDumpDomain + "/plwiki/latest/"
)


@@ -1,79 +0,0 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
)

type Page struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

type Revision struct {
	Text string `xml:"text"`
}

func DownloadArticles() {
	url := "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles1.xml-p1p187037.bz2"
	limiter := flag.Int("limit", 10, "Number of articles to process; 0 means all")
	flag.Parse()
	log.Printf("Downloading chunk: %s", url)
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("Failed to download chunk: %v", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
		log.Fatalf("Bad response status: %s\nBody: %s", resp.Status, string(body))
	}
	bz2Reader := bzip2.NewReader(resp.Body)
	dec := xml.NewDecoder(bz2Reader)
	count := 0
	for {
		tok, err := dec.Token()
		if err != nil {
			if err == io.EOF {
				log.Println("Reached end of chunk")
				break
			}
			log.Fatalf("XML token error: %v", err)
		}
		switch se := tok.(type) {
		case xml.StartElement:
			if se.Name.Local == "page" {
				var p Page
				if err := dec.DecodeElement(&p, &se); err != nil {
					log.Printf("Error decoding page: %v", err)
					continue
				}
				count++
				fmt.Printf("---- Article %d ----\n", count)
				fmt.Printf("Title: %s\n", p.Title)
				// fmt.Println("Content:")
				// fmt.Println(p.Revision.Text)
				fmt.Println("--------------------\n")
				if *limiter > 0 && count >= *limiter {
					log.Printf("Reached limit of %d articles, stopping.", *limiter)
					return
				}
			}
		}
	}
	log.Printf("Done. Total articles processed: %d", count)
}


@@ -3,5 +3,6 @@ package wikipediadl
import "errors"
var (
ErrArticleFetchFailed = errors.New("wikipediadl: failed to fetch articles")
ErrArticleBundleFetchFailed = errors.New("wikipediadl: failed to fetch article bundles")
ErrArticleDownloadFailed = errors.New("wikipediadl: failed to extract articles")
)


@@ -0,0 +1,75 @@
package wikipediadl

import (
	"compress/bzip2"
	"encoding/xml"
	"errors"
	"io"
	"log"
	"net/http"
)

type WikiArticle struct {
	Title    string   `xml:"title"`
	Revision Revision `xml:"revision"`
}

type Revision struct {
	Text string `xml:"text"`
}

func ExtractArticles(bundle string) ([]WikiArticle, error) {
	url := WikipediaDumpUrl + bundle
	resp, err := http.Get(url)
	if err != nil {
		log.Println(err.Error())
		return nil, errors.New("wikipediadl: failed to load articles")
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, errors.New("wikipediadl: bad response status")
	}
	// Stream the bzip2-compressed XML dump and decode one <page> at a time.
	bz2Reader := bzip2.NewReader(resp.Body)
	xmlDec := xml.NewDecoder(bz2Reader)
	count := 0
	articles := []WikiArticle{}
Loop:
	for {
		tok, err := xmlDec.Token()
		if err != nil {
			if err == io.EOF {
				break
			}
			return nil, errors.New("wikipediadl: XML token error")
		}
		switch se := tok.(type) {
		case xml.StartElement:
			if count == 2 { // XXX: debug cap, remove later
				break Loop // labeled break: a plain break would only exit the switch
			}
			if se.Name.Local != "page" {
				continue
			}
			var p WikiArticle
			if err := xmlDec.DecodeElement(&p, &se); err != nil {
				log.Println(err.Error())
				continue
			}
			articles = append(articles, p)
			count++ // XXX: remove later
		}
	}
	return articles, nil
}
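
ExtractArticles streams the compressed dump and decodes one page element at a time, so memory stays bounded regardless of chunk size; note the XXX-flagged cap above still limits extraction to two articles. A minimal usage sketch, assuming only the functions in this package:

package main

import (
	"log"

	"scrap/internal/wikipediadl"
)

func main() {
	// Sketch: list the available dump bundles and extract the first one.
	bundles, err := wikipediadl.FetchArticleBundles()
	if err != nil {
		log.Fatalln(err)
	}
	if len(bundles) == 0 {
		log.Fatalln("no bundles found")
	}
	articles, err := wikipediadl.ExtractArticles(bundles[0])
	if err != nil {
		log.Fatalln(err)
	}
	log.Printf("extracted %d articles from %s", len(articles), bundles[0])
}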


@@ -1,7 +1,6 @@
 package wikipediadl

 import (
-	"fmt"
 	"log"
 	"strings"
@@ -13,12 +12,17 @@ const (
 	DumpUrl = "https://dumps.wikimedia.org/plwiki/latest/"
 )

+func FetchArticleBundles() ([]string, error) {
+	scraper := getScraper()
+	articles := getAllArticles(scraper)
+	return articles, nil
+}
+
 func getScraper() *colly.Collector {
-	s := colly.NewCollector(
+	return colly.NewCollector(
 		colly.AllowedDomains(DumpDomain),
 	)
-	return s
 }

 func getAllArticles(s *colly.Collector) []string {
@@ -58,14 +62,3 @@ func isValidArticle(a string) bool {
 	articleIndex := article[0]
 	return articleIndex >= 48 && articleIndex <= 57
 }
-
-func FetchArticles() error {
-	scraper := getScraper()
-	articles := getAllArticles(scraper)
-	for _, a := range articles {
-		fmt.Println(a)
-	}
-	return nil
-}
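
The magic numbers 48 and 57 in isValidArticle are the ASCII codes for '0' and '9': the check keeps only bundle names whose first character is a digit. A behavior-identical form using byte literals reads more clearly (a suggestion, not part of this commit):

// Suggested equivalent of the digit check in isValidArticle;
// byte literals make the ASCII range explicit.
func startsWithDigit(s string) bool {
	return len(s) > 0 && s[0] >= '0' && s[0] <= '9'
}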