From bc0bcafeaaa62e36c2950c21dd8677aa5f865850 Mon Sep 17 00:00:00 2001
From: cornelk
Date: Fri, 12 Jan 2024 08:39:46 -0600
Subject: [PATCH] scraper: use a queue for webpages to download to avoid
 recursion

---
 scraper/scraper.go | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/scraper/scraper.go b/scraper/scraper.go
index fce885f..0ba722e 100644
--- a/scraper/scraper.go
+++ b/scraper/scraper.go
@@ -54,7 +54,9 @@ type Scraper struct {
 	// key is the URL of page or asset
 	processed map[string]struct{}
 
-	imagesQueue []*url.URL
+	imagesQueue       []*url.URL
+	webPageQueue      []*url.URL
+	webPageQueueDepth map[string]uint
 }
 
 // New creates a new Scraper instance.
@@ -120,7 +122,9 @@ func New(logger *log.Logger, cfg Config) (*Scraper, error) {
 		includes: includes,
 		excludes: excludes,
 
-		processed: make(map[string]struct{}),
+		processed: map[string]struct{}{},
+
+		webPageQueueDepth: map[string]uint{},
 	}
 
 	if s.config.Username != "" {
@@ -138,13 +142,19 @@ func (s *Scraper) Start(ctx context.Context) error {
 		}
 	}
 
-	p := s.URL.Path
-	if p == "" {
-		p = "/"
+	if !s.shouldURLBeDownloaded(s.URL, 0, false) {
+		return errors.New("start page is excluded from downloading")
 	}
-	s.processed[p] = struct{}{}
 
 	s.downloadWebpage(ctx, s.URL, 0)
+
+	for len(s.webPageQueue) > 0 {
+		ur := s.webPageQueue[0]
+		s.webPageQueue = s.webPageQueue[1:]
+		currentDepth := s.webPageQueueDepth[ur.String()]
+		s.downloadWebpage(ctx, ur, currentDepth+1)
+	}
+
 	return nil
 }
 
@@ -189,7 +199,6 @@ func (s *Scraper) downloadWebpage(ctx context.Context, u *url.URL, currentDepth
 
 	s.downloadReferences(ctx, index)
 
-	var toScrape []*url.URL
 	// check first and download afterward to not hit max depth limit for
 	// start page links because of recursive linking
 	// a hrefs
 	for _, ur := range references {
 		if s.shouldURLBeDownloaded(ur, currentDepth, false) {
-			toScrape = append(toScrape, ur)
+			s.webPageQueue = append(s.webPageQueue, ur)
+			s.webPageQueueDepth[ur.String()] = currentDepth
 		}
 	}
-
-	for _, URL := range toScrape {
-		s.downloadWebpage(ctx, URL, currentDepth+1)
-	}
 }
 
 func (s *Scraper) sendHTTPRequest(ctx context.Context, u *url.URL, buf *bytes.Buffer) (*url.URL, error) {
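
For readers who want the pattern outside the diff context, below is a minimal, self-contained sketch of the same idea: discovered links are appended to a slice-backed queue together with their depth and drained in a flat loop instead of being downloaded recursively. The names used here (crawler, enqueue, run, fetchLinks, maxDepth) are illustrative only and are not goscrape's actual API.

// Sketch of a queue-driven crawl loop; not goscrape code.
package main

import (
	"fmt"
	"net/url"
)

type crawler struct {
	processed map[string]struct{} // pages already seen
	queue     []*url.URL          // pages waiting to be downloaded
	depths    map[string]uint     // depth at which each queued page was found
	maxDepth  uint
}

// enqueue records a page for later download instead of recursing into it.
func (c *crawler) enqueue(u *url.URL, depth uint) {
	key := u.String()
	if _, ok := c.processed[key]; ok || depth > c.maxDepth {
		return
	}
	c.processed[key] = struct{}{}
	c.queue = append(c.queue, u)
	c.depths[key] = depth
}

// run drains the queue iteratively; the call stack stays flat no matter
// how deeply pages link to each other.
func (c *crawler) run(fetchLinks func(*url.URL) []*url.URL) {
	for len(c.queue) > 0 {
		u := c.queue[0]
		c.queue = c.queue[1:]
		depth := c.depths[u.String()]
		fmt.Println("downloading", u, "at depth", depth)
		for _, link := range fetchLinks(u) {
			c.enqueue(link, depth+1)
		}
	}
}

func main() {
	start, _ := url.Parse("https://example.com/")
	c := &crawler{
		processed: map[string]struct{}{},
		depths:    map[string]uint{},
		maxDepth:  2,
	}
	c.enqueue(start, 0)
	// Stub fetcher: pretends every page links to exactly one child page.
	c.run(func(u *url.URL) []*url.URL {
		child, _ := url.Parse(u.String() + "sub/")
		return []*url.URL{child}
	})
}

Draining a queue like this keeps the call stack depth constant regardless of how deeply pages link to one another, which is the motivation stated in the commit subject.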