diff --git a/README.rdoc b/README.rdoc index 83c96aa..5326bdb 100644 --- a/README.rdoc +++ b/README.rdoc @@ -8,6 +8,7 @@ Usage: robotex = Robotex.new "My User Agent" robotex.allowed?("http://www.example.com/foo") robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed + robotex.sitemaps("http://www.example.com/") # return an array of sitemap urls == Acknowledgements diff --git a/lib/robotex.rb b/lib/robotex.rb index bf186ab..07d4d32 100644 --- a/lib/robotex.rb +++ b/lib/robotex.rb @@ -25,6 +25,7 @@ def initialize(uri, user_agent) @disallows = {} @allows = {} @delays = {} + @sitemaps = [] agent = /.*/ io.each do |line| next if line =~ /^\s*(#.*|$)/ @@ -43,6 +44,8 @@ def initialize(uri, user_agent) @disallows[agent] << to_regex(value) when "crawl-delay" @delays[agent] = value.to_i + when "sitemap" + @sitemaps << value end end @@ -86,6 +89,10 @@ def delay(user_agent) end nil end + + def sitemaps(uri) + @sitemaps + end protected @@ -148,5 +155,11 @@ def delay!(uri) sleep delay - (Time.now - @last_accessed) if !!delay @last_accessed = Time.now end - + + # + # Returns an array of the sitemap urls specified in robots.txt + # + def sitemaps(uri) + parse_host(uri).sitemaps(uri) + end end diff --git a/spec/robotex_spec.rb b/spec/robotex_spec.rb index d8b4388..1f67c83 100644 --- a/spec/robotex_spec.rb +++ b/spec/robotex_spec.rb @@ -5,6 +5,9 @@ before(:all) do FakeWeb.allow_net_connect = false robots = <<-END +Sitemap: http://www.example.com/sitemap_1.xml +Sitemap: http://www.example.com/sitemap_2.xml + User-Agent: msnbot Crawl-Delay: 20 @@ -73,6 +76,7 @@ robotex = Robotex.new robotex.delay(SPEC_DOMAIN).should be_nil end + end context 'when Crawl-Delay is specified for the user-agent' do it 'returns the delay as a Fixnum' do @@ -80,8 +84,13 @@ robotex.delay(SPEC_DOMAIN).should == 20 end end - end end + describe '#sitemaps' do + it 'returns an array of sitemaps' do + sitemaps = ['http://www.example.com/sitemap_1.xml','http://www.example.com/sitemap_2.xml'] + robotex = Robotex.new + robotex.sitemaps(SPEC_DOMAIN).should == sitemaps + end + end end -