changing stop_crawl behaviour to stop new links, but finish processing all pages in queue

efrat-safanov committed Nov 25, 2012
1 parent d87b6e1 commit 98d6b15
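
For context, a rough usage sketch of the new behaviour (core Anemone API plus this fork's stop_crawl and :pages_queue_limit additions shown in the diff below; the URL is a placeholder): calling stop_crawl from a page block now only stops new links from being enqueued, while pages already sitting in the queue are still processed.

require 'anemone'

Anemone.crawl("http://example.com/", :pages_queue_limit => 1) do |anemone|
  pages_seen = 0
  anemone.on_every_page do |page|
    pages_seen += 1
    # Request a stop: no new links are enqueued from here on,
    # but pages already queued are still yielded to this block.
    anemone.stop_crawl if pages_seen == 2
  end
end
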
Showing 2 changed files with 8 additions and 9 deletions.
lib/anemone/core.rb (11 changes: 5 additions & 6 deletions)
@@ -180,23 +180,22 @@ def run
 loop do
   page = page_queue.deq
   @pages.touch_key page.url
-  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+  puts "#{page.url} Queue: #{link_queue.size} PageQueue #{page_queue.size}" if @opts[:verbose]
   do_page_blocks page
   page.discard_doc! if @opts[:discard_page_bodies]

-  links = links_to_follow page
-  if link_queue.num_waiting < @opts[:links_limit]

+  if link_queue.size < @opts[:links_limit] and !@stop_crawl
+    links = links_to_follow page
     links.each do |link|
       link_queue << [link, page.url.dup, page.depth + 1]
     end
     @pages.touch_keys links
   end


   @pages[page.url] = page

   if @stop_crawl
-    page_queue.clear
     link_queue.clear
   end

@@ -205,7 +204,7 @@ def run
 until link_queue.num_waiting == @tentacles.size
   Thread.pass
 end
-if page_queue.empty? || @stop_crawl
+if page_queue.empty?
   @tentacles.size.times { link_queue << :END }
   break
 end
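
Conceptually, the two hunks above amount to "stop feeding the queues, but drain what is already there". A minimal self-contained toy sketch of that pattern using Ruby's Queue (illustration only, not Anemone's actual code):

require 'thread'

# Toy model of the scheduling idea in this commit: once a stop is
# requested, no new work is enqueued, but everything already in the
# queue is still processed.
page_queue = Queue.new
5.times { |i| page_queue << "page-#{i}" }

stop_crawl = false
processed  = 0

until page_queue.empty?
  page = page_queue.pop
  processed += 1
  stop_crawl = true if processed == 2              # like anemone.stop_crawl

  # newly discovered links would normally be enqueued here,
  # but only while the crawl has not been stopped
  page_queue << "#{page}-link" unless stop_crawl
end

puts processed  # => 6: every queued page was handled despite the early stop
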
spec/core_spec.rb (6 changes: 3 additions & 3 deletions)
@@ -230,13 +230,13 @@ module Anemone

it "should stop crawl if requested" do
num_pages = 0
Anemone.crawl(@pages[0].url) do |anemone|
Anemone.crawl(@pages[0].url, @opts.merge({:pages_queue_limit => 1})) do |anemone|
anemone.on_every_page do
num_pages += 1
anemone.stop_crawl
anemone.stop_crawl if num_pages == 2
end
end
num_pages.should == 1
num_pages.should == 2
end

it "should limit number of links per crawl" do
