diff --git a/cmd/get.go b/cmd/get.go
index 0a8a01aa..fb4e2146 100644
--- a/cmd/get.go
+++ b/cmd/get.go
@@ -34,7 +34,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
 	getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.")
 	getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar")
 	getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.")
-	getCmd.PersistentFlags().Bool("local-seencheck", false, "Simple local seencheck to avoid re-crawling of URIs.")
+	getCmd.PersistentFlags().Bool("disable-seencheck", false, "Disable the (remote or local) seencheck that avoids re-crawling of URIs.")
 	getCmd.PersistentFlags().Bool("json", false, "Output logs in JSON")
 	getCmd.PersistentFlags().Bool("debug", false, "")
 	getCmd.PersistentFlags().Bool("api", false, "Enable API")
diff --git a/config/config.go b/config/config.go
index 7c26e722..664d0007 100644
--- a/config/config.go
+++ b/config/config.go
@@ -58,7 +58,7 @@ type Config struct {
 	HQBatchSize      int64 `mapstructure:"hq-batch-size"`
 	KeepCookies      bool  `mapstructure:"keep-cookies"`
 	Headless         bool  `mapstructure:"headless"`
-	LocalSeencheck   bool  `mapstructure:"local-seencheck"`
+	DisableSeencheck bool  `mapstructure:"disable-seencheck"`
 	JSON             bool  `mapstructure:"json"`
 	Debug            bool  `mapstructure:"debug"`
 	LiveStats        bool  `mapstructure:"live-stats"`
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index 17733792..04436776 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -149,18 +149,20 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bo
 
 	// Seencheck the URL
 	if c.UseSeencheck {
-		found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "seed")
-		if found {
-			return nil, errors.New("URL from redirection has already been seen")
-		}
-	} else if c.UseHQ {
-		isNewURL, err := c.HQSeencheckURL(URL)
-		if err != nil {
-			return resp, err
-		}
+		if c.UseHQ {
+			isNewURL, err := c.HQSeencheckURL(URL)
+			if err != nil {
+				return resp, err
+			}
 
-		if !isNewURL {
-			return nil, errors.New("URL from redirection has already been seen")
+			if !isNewURL {
+				return nil, errors.New("URL from redirection has already been seen")
+			}
+		} else {
+			found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "seed")
+			if found {
+				return nil, errors.New("URL from redirection has already been seen")
+			}
 		}
 	}
 
@@ -432,18 +434,19 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	// because we already archived the URLs, we just want them to be added
 	// to the seencheck table.
 	if c.UseSeencheck {
-		for _, cfstreamURL := range cfstreamURLs {
-			c.Seencheck.SeencheckURL(cfstreamURL, "asset")
-		}
-	} else if c.UseHQ {
-		_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
-				"urls": cfstreamURLs,
-			})).Error("error while seenchecking assets via HQ")
+		if c.UseHQ {
+			_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
+					"urls": cfstreamURLs,
+				})).Error("error while seenchecking assets via HQ")
+			}
+		} else {
+			for _, cfstreamURL := range cfstreamURLs {
+				c.Seencheck.SeencheckURL(cfstreamURL, "asset")
+			}
 		}
 	}
 
-	// Log the archived URLs
 	for _, cfstreamURL := range cfstreamURLs {
 		c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
@@ -511,38 +514,40 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	// seencheck DB. If they are, then they are skipped.
 	// Else, if we use HQ, then we use HQ's seencheck.
 	if c.UseSeencheck {
-		seencheckedBatch := []*url.URL{}
+		if c.UseHQ {
+			seencheckedURLs, err := c.HQSeencheckURLs(assets)
+			// We ignore the error here because we don't want to slow down the crawl
+			// if HQ is down or if the request failed. So if we get an error, we just
+			// continue with the original list of assets.
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
+					"urls":      assets,
+					"parentHop": item.Hop,
+					"parentUrl": utils.URLToString(item.URL),
+				})).Error("error while seenchecking assets via HQ")
+			} else {
+				assets = seencheckedURLs
+			}
 
-		for _, URL := range assets {
-			found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
-			if found {
-				continue
+			if len(assets) == 0 {
+				return err
 			}
-			seencheckedBatch = append(seencheckedBatch, URL)
-		}
+		} else {
+			seencheckedBatch := []*url.URL{}
 
-		if len(seencheckedBatch) == 0 {
-			return err
-		}
+			for _, URL := range assets {
+				found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
+				if found {
+					continue
+				}
+				seencheckedBatch = append(seencheckedBatch, URL)
+			}
 
-		assets = seencheckedBatch
-	} else if c.UseHQ {
-		seencheckedURLs, err := c.HQSeencheckURLs(assets)
-		// We ignore the error here because we don't want to slow down the crawl
-		// if HQ is down or if the request failed. So if we get an error, we just
-		// continue with the original list of assets.
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
-				"urls":      assets,
-				"parentHop": item.Hop,
-				"parentUrl": utils.URLToString(item.URL),
-			})).Error("error while seenchecking assets via HQ")
-		} else {
-			assets = seencheckedURLs
-		}
+			if len(seencheckedBatch) == 0 {
+				return err
+			}
 
-		if len(assets) == 0 {
-			return err
+			assets = seencheckedBatch
 		}
 	}
diff --git a/internal/pkg/crawl/config.go b/internal/pkg/crawl/config.go
index e5fa7fa0..ee9003bb 100644
--- a/internal/pkg/crawl/config.go
+++ b/internal/pkg/crawl/config.go
@@ -227,7 +227,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) {
 		})
 	}
 
-	c.UseSeencheck = config.LocalSeencheck
+	c.UseSeencheck = !config.DisableSeencheck
 	c.HTTPTimeout = config.HTTPTimeout
 	c.MaxConcurrentRequestsPerDomain = config.MaxConcurrentRequestsPerDomain
 	c.RateLimitDelay = config.ConcurrentSleepLength
diff --git a/internal/pkg/crawl/crawl.go b/internal/pkg/crawl/crawl.go
index 4959ff78..7e134d5c 100644
--- a/internal/pkg/crawl/crawl.go
+++ b/internal/pkg/crawl/crawl.go
@@ -2,6 +2,7 @@ package crawl
 
 import (
+	"os"
 	"path"
 	"sync"
 	"time"
 
@@ -155,7 +156,7 @@ func (c *Crawl) Start() (err error) {
 		go c.HQProducer()
 		go c.HQFinisher()
 		go c.HQWebsocket()
-	} else {
+	} else if len(c.SeedList) > 0 {
 		// Temporarily disable handover as it's not needed
 		enableBackHandover := make(chan struct{})
 		syncHandover := make(chan struct{})
@@ -166,6 +167,27 @@ func (c *Crawl) Start() (err error) {
 			<-syncHandover
 		}
 
+		// Dedupe the seeds list
+		if c.UseSeencheck {
+			c.Log.Info("Seenchecking seeds list..")
+
+			var seencheckedSeeds []queue.Item
+			var duplicates int
+			for i := 0; i < len(c.SeedList); i++ {
+				if c.Seencheck.SeencheckURL(c.SeedList[i].URL.String(), "seed") {
+					duplicates++
+					continue
+				}
+
+				seencheckedSeeds = append(seencheckedSeeds, c.SeedList[i])
+			}
+
+			c.SeedList = seencheckedSeeds
+
+			c.Log.Info("Seencheck done", "duplicates", duplicates)
+		}
+
 		// Push the seed list to the queue
 		c.Log.Info("Pushing seeds in the local queue..")
 		for i := 0; i < len(c.SeedList); i += 100000 {
@@ -199,6 +221,9 @@ func (c *Crawl) Start() (err error) {
 		close(syncHandover)
 
 		c.Log.Info("All seeds are now in queue")
+	} else {
+		c.Log.Info("No seeds to crawl")
+		os.Exit(0)
 	}
 
 	// Start the workers pool by building all the workers and starting them
diff --git a/internal/pkg/crawl/outlinks.go b/internal/pkg/crawl/outlinks.go
index 303c9a69..980a6bde 100644
--- a/internal/pkg/crawl/outlinks.go
+++ b/internal/pkg/crawl/outlinks.go
@@ -84,6 +84,13 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.Wa
 			continue
 		}
 
+		// Seencheck the outlink
+		if c.UseSeencheck {
+			if c.Seencheck.SeencheckURL(utils.URLToString(outlink), "seed") {
+				continue
+			}
+		}
+
 		if c.DomainsCrawl && strings.Contains(item.URL.Host, outlink.Host) && item.Hop == 0 {
 			newItem, err := queue.NewItem(outlink, item.URL, "seed", 0, "", false)
 			if err != nil {
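
For illustration, the gating this patch applies uniformly to redirects, assets, seeds, and outlinks reduces to the sketch below. This is a minimal stand-in, not the real implementation: the `crawl` struct and its `seen` map are hypothetical simplifications of the `Crawl`, `Seencheck`, and HQ types in internal/pkg/crawl. The two points it demonstrates are that seencheck is now opt-out (UseSeencheck = !DisableSeencheck) and that, when Crawl HQ is in use, the remote seencheck takes precedence over the local one.

package main

import "fmt"

// Hypothetical, pared-down stand-in for the real Crawl type.
type crawl struct {
	UseSeencheck bool            // derived as !DisableSeencheck since this patch
	UseHQ        bool            // whether Crawl HQ drives the crawl
	seen         map[string]bool // stand-in for the local seencheck DB
}

// seenBefore mirrors the patched precedence: no check at all when
// seencheck is disabled, HQ's remote check when HQ is enabled, and
// the local check otherwise. Returns true if the URL was already seen.
func (c *crawl) seenBefore(url string) bool {
	if !c.UseSeencheck {
		return false // --disable-seencheck: never skip a URL
	}
	if c.UseHQ {
		// The real code calls c.HQSeencheckURL(URL) here, a remote lookup.
		return false // placeholder: assume HQ reports the URL as new
	}
	if c.seen[url] {
		return true
	}
	c.seen[url] = true
	return false
}

func main() {
	c := &crawl{UseSeencheck: true, seen: map[string]bool{}}
	fmt.Println(c.seenBefore("https://example.com")) // false: first visit
	fmt.Println(c.seenBefore("https://example.com")) // true: duplicate, skipped
}

One consequence worth noting: because the whole block is guarded by UseSeencheck, passing --disable-seencheck now skips both the local and the HQ-backed checks, and invocations that previously passed --local-seencheck get that behavior by default and must simply drop the flag.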