better handling of context cancellation to stop program
cornelk committed Jan 12, 2024
1 parent bc0bcaf commit 1d3fc7e
Showing 3 changed files with 36 additions and 14 deletions.
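
For context: the change only has an effect if the context handed to run (and through it to sc.Start) is cancelled when the user aborts the program. That wiring is not part of this commit, but a minimal sketch of the assumed setup (the signal handling and the run stand-in here are illustrative, not the repository's actual code) looks like this:

package main

import (
	"context"
	"errors"
	"fmt"
	"os"
	"os/signal"
)

func main() {
	// Cancel the context when the user presses Ctrl-C (SIGINT).
	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt)
	defer stop()

	if err := run(ctx); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run stands in for the real run function in main.go below.
func run(ctx context.Context) error {
	// A cancelled context surfaces as context.Canceled from ctx.Err()
	// (or from any operation that honors the context, such as sc.Start).
	if err := ctx.Err(); err != nil {
		if errors.Is(err, context.Canceled) {
			os.Exit(0) // user-requested stop: exit cleanly, as in the main.go change below
		}
		return fmt.Errorf("scraping: %w", err)
	}
	return nil
}
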
4 changes: 4 additions & 0 deletions main.go
@@ -137,6 +137,10 @@ func run(ctx context.Context, args arguments) error {
 
 		logger.Info("Scraping", log.String("url", sc.URL.String()))
 		if err = sc.Start(ctx); err != nil {
+			if errors.Is(err, context.Canceled) {
+				os.Exit(0)
+			}
+
 			return fmt.Errorf("scraping '%s': %w", sc.URL, err)
 		}
 	}
26 changes: 18 additions & 8 deletions scraper/download.go
@@ -3,6 +3,7 @@ package scraper
 import (
 	"bytes"
 	"context"
+	"fmt"
 	"net/url"
 	"os"
 
@@ -14,7 +15,7 @@ import (
 // a downloaded file content before it will be stored on disk.
 type assetProcessor func(URL *url.URL, buf *bytes.Buffer) *bytes.Buffer
 
-func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) {
+func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {
 	references, err := index.URLs("img")
 	if err != nil {
 		s.logger.Error("Getting img nodes URLs failed", log.Err(err))
@@ -26,34 +27,41 @@ func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index
 		s.logger.Error("Getting link nodes URLs failed", log.Err(err))
 	}
 	for _, ur := range references {
-		s.downloadAsset(ctx, ur, s.checkCSSForUrls)
+		if err := s.downloadAsset(ctx, ur, s.checkCSSForUrls); err != nil {
+			return err
+		}
 	}
 
 	references, err = index.URLs("script")
 	if err != nil {
 		s.logger.Error("Getting script nodes URLs failed", log.Err(err))
 	}
 	for _, ur := range references {
-		s.downloadAsset(ctx, ur, nil)
+		if err := s.downloadAsset(ctx, ur, nil); err != nil {
+			return err
+		}
 	}
 
 	for _, image := range s.imagesQueue {
-		s.downloadAsset(ctx, image, s.checkImageForRecode)
+		if err := s.downloadAsset(ctx, image, s.checkImageForRecode); err != nil {
+			return err
+		}
 	}
 	s.imagesQueue = nil
+	return nil
 }
 
 // downloadAsset downloads an asset if it does not exist on disk yet.
-func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) {
+func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor assetProcessor) error {
 	urlFull := u.String()
 
 	if !s.shouldURLBeDownloaded(u, 0, true) {
-		return
+		return nil
 	}
 
 	filePath := s.getFilePath(u, false)
 	if _, err := os.Stat(filePath); !os.IsNotExist(err) {
-		return // exists already on disk
+		return nil // exists already on disk
 	}
 
 	s.logger.Info("Downloading asset", log.String("url", urlFull))
@@ -64,7 +72,7 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 		s.logger.Error("Downloading asset failed",
 			log.String("url", urlFull),
 			log.Err(err))
-		return
+		return fmt.Errorf("downloading asset: %w", err)
 	}
 
 	if processor != nil {
@@ -77,4 +85,6 @@ func (s *Scraper) downloadAsset(ctx context.Context, u *url.URL, processor asset
 			log.String("file", filePath),
 			log.Err(err))
	}
+
+	return nil
 }
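
Note that downloadAsset now wraps the underlying error with %w, and downloadWebpage wraps again further up the chain; because %w preserves the error chain, the errors.Is(err, context.Canceled) check in main.go still matches after any number of wraps. A small self-contained sketch (the function names and URL are illustrative, not from the repository):

package main

import (
	"context"
	"errors"
	"fmt"
)

// download stands in for an HTTP call that failed because its context was cancelled.
func download() error {
	return fmt.Errorf("downloading asset: %w", context.Canceled)
}

// scrape wraps the error once more, as downloadWebpage does.
func scrape() error {
	if err := download(); err != nil {
		return fmt.Errorf("scraping 'https://example.com': %w", err)
	}
	return nil
}

func main() {
	err := scrape()
	// errors.Is unwraps every %w layer, so the original cause is still detectable.
	fmt.Println(errors.Is(err, context.Canceled)) // true
}
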
20 changes: 14 additions & 6 deletions scraper/scraper.go
@@ -146,19 +146,23 @@ func (s *Scraper) Start(ctx context.Context) error {
 		return errors.New("start page is excluded from downloading")
 	}
 
-	s.downloadWebpage(ctx, s.URL, 0)
+	if err := s.downloadWebpage(ctx, s.URL, 0); err != nil {
+		return err
+	}
 
 	for len(s.webPageQueue) > 0 {
 		ur := s.webPageQueue[0]
 		s.webPageQueue = s.webPageQueue[1:]
 		currentDepth := s.webPageQueueDepth[ur.String()]
-		s.downloadWebpage(ctx, ur, currentDepth+1)
+		if err := s.downloadWebpage(ctx, ur, currentDepth+1); err != nil {
+			return err
+		}
 	}
 
 	return nil
 }
 
-func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth uint) {
+func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth uint) error {
 	buf := &bytes.Buffer{}
 
 	s.logger.Info("Downloading webpage", log.String("url", u.String()))
@@ -168,7 +172,7 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 		s.logger.Error("Processing HTTP Request failed",
 			log.String("url", u.String()),
 			log.Err(err))
-		return
+		return err
 	}
 
 	fileExtension := ""
@@ -189,15 +193,17 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 		s.logger.Error("Parsing HTML failed",
 			log.String("url", u.String()),
 			log.Err(err))
-		return
+		return fmt.Errorf("parsing HTML: %w", err)
 	}
 
 	index := htmlindex.New()
 	index.Index(u, doc)
 
 	s.storeDownload(u, buf, doc, index, fileExtension)
 
-	s.downloadReferences(ctx, index)
+	if err := s.downloadReferences(ctx, index); err != nil {
+		return err
+	}
 
 	// check first and download afterward to not hit max depth limit for
 	// start page links because of recursive linking
@@ -213,6 +219,8 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 			s.webPageQueueDepth[ur.String()] = currentDepth
 		}
 	}
+
+	return nil
 }
 
 func (s *Scraper) sendHTTPRequest(ctx context.Context, u *url.URL, buf *bytes.Buffer) (*url.URL, error) {
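
The cancellation itself would originate in the HTTP layer: sendHTTPRequest is not shown in this diff, but assuming the request is built with http.NewRequestWithContext (an assumption, not confirmed by this commit), a cancelled context makes the client return an error that wraps context.Canceled, which then propagates through downloadWebpage and Start to the check in main.go. A hedged sketch of that mechanism:

package main

import (
	"context"
	"errors"
	"fmt"
	"net/http"
)

// fetch is an illustrative stand-in for a context-aware HTTP request.
func fetch(ctx context.Context, url string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return fmt.Errorf("creating request: %w", err)
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		// With a cancelled ctx, Do returns an error wrapping context.Canceled.
		return fmt.Errorf("sending request: %w", err)
	}
	defer resp.Body.Close()
	return nil
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately to simulate a user abort

	err := fetch(ctx, "https://example.com")
	fmt.Println(errors.Is(err, context.Canceled)) // true
}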
