From 257f29934300346079d97b9846cc7d8985077e98 Mon Sep 17 00:00:00 2001 From: Ben Caldwell Date: Mon, 27 Jan 2014 18:12:14 +1100 Subject: [PATCH] Give focus_crawl a chance to access page body before discarding it --- lib/anemone/core.rb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb index d1629a49..70731fdc 100644 --- a/lib/anemone/core.rb +++ b/lib/anemone/core.rb @@ -50,7 +50,7 @@ class Core :accept_cookies => false, # skip any link with a query string? e.g. http://foo.com/?u=user :skip_query_strings => false, - # proxy server hostname + # proxy server hostname :proxy_host => nil, # proxy server port number :proxy_port => false, @@ -164,10 +164,11 @@ def run page = page_queue.deq @pages.touch_key page.url puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose] + do_page_blocks page + links = links_to_follow page page.discard_doc! if @opts[:discard_page_bodies] - links = links_to_follow page links.each do |link| link_queue << [link, page.url.dup, page.depth + 1] end @@ -281,7 +282,7 @@ def too_deep?(from_page) false end end - + # # Returns +true+ if *link* should not be visited because # it has a query string and +skip_query_strings+ is true.