From 21fd4d787848fa082ce1e21845d4728933b99fd0 Mon Sep 17 00:00:00 2001
From: MothOnMars
Date: Wed, 24 Jan 2018 08:57:32 -0800
Subject: [PATCH] add #sitemaps method

---
 README.rdoc          |  1 +
 lib/robotex.rb       | 13 ++++++++-
 spec/robotex_spec.rb | 65 +++++++++++++++++++++++++++++++-------------
 3 files changed, 59 insertions(+), 20 deletions(-)

diff --git a/README.rdoc b/README.rdoc
index 83c96aa..5326bdb 100644
--- a/README.rdoc
+++ b/README.rdoc
@@ -8,6 +8,7 @@ Usage:
   robotex = Robotex.new "My User Agent"
   robotex.allowed?("http://www.example.com/foo")
   robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
+  robotex.sitemaps("http://www.example.com/") # return an array of sitemap urls

 == Acknowledgements

diff --git a/lib/robotex.rb b/lib/robotex.rb
index bf186ab..0d78364 100644
--- a/lib/robotex.rb
+++ b/lib/robotex.rb
@@ -15,6 +15,8 @@ class Robotex

   class ParsedRobots

+    attr_reader :sitemaps
+
     def initialize(uri, user_agent)
       io = Robotex.get_robots_txt(uri, user_agent)

@@ -25,6 +27,7 @@ def initialize(uri, user_agent)
       @disallows = {}
       @allows = {}
       @delays = {}
+      @sitemaps = []
       agent = /.*/
       io.each do |line|
         next if line =~ /^\s*(#.*|$)/
@@ -43,6 +46,8 @@ def initialize(uri, user_agent)
           @disallows[agent] << to_regex(value)
         when "crawl-delay"
           @delays[agent] = value.to_i
+        when "sitemap"
+          @sitemaps << URI.join(uri, value).to_s
         end
       end

@@ -148,5 +153,11 @@ def delay!(uri)
     sleep delay - (Time.now - @last_accessed) if !!delay
     @last_accessed = Time.now
   end
-
+
+  #
+  # Returns an array of the sitemap urls specified in robots.txt
+  #
+  def sitemaps(uri)
+    parse_host(uri).sitemaps
+  end
 end
diff --git a/spec/robotex_spec.rb b/spec/robotex_spec.rb
index d8b4388..51963ca 100644
--- a/spec/robotex_spec.rb
+++ b/spec/robotex_spec.rb
@@ -1,25 +1,30 @@
 require 'spec_helper'

 describe Robotex do
+  let(:robots) do
+    <<~ROBOTS
+      User-Agent: msnbot
+      Crawl-Delay: 20

-  before(:all) do
+      User-Agent: bender
+      Disallow: /my_shiny_metal_ass
+
+      User-Agent: *
+      Disallow: /login
+      Allow: /
+
+      Disallow: /locked
+      Allow: /locked
+    ROBOTS
+  end
+
+  let(:response) do
+    { body: robots, content_type: 'text/plain', status: [200, "OK"] }
+  end
+
+  before do
     FakeWeb.allow_net_connect = false
-    robots = <<-END
-User-Agent: msnbot
-Crawl-Delay: 20
-
-User-Agent: bender
-Disallow: /my_shiny_metal_ass
-
-User-Agent: *
-Disallow: /login
-Allow: /
-
-Disallow: /locked
-Allow: /locked
-END
-    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
-    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', response)
   end

   describe '#initialize' do
@@ -73,6 +78,7 @@
         robotex = Robotex.new
         robotex.delay(SPEC_DOMAIN).should be_nil
       end
+    end

     context 'when Crawl-Delay is specified for the user-agent' do
       it 'returns the delay as a Fixnum' do
@@ -80,8 +86,29 @@
         robotex.delay(SPEC_DOMAIN).should == 20
       end
     end
-  end
-
 end
-
-end
+
+  describe '#sitemaps' do
+    let(:robots) do
+      <<~ROBOTS
+        Sitemap: http://www.example.com/sitemap_1.xml
+        Sitemap: http://www.example.com/sitemap_2.xml
+      ROBOTS
+    end
+
+    it 'returns an array of sitemaps' do
+      robotex = Robotex.new
+      robotex.sitemaps(SPEC_DOMAIN).should == %w[http://www.example.com/sitemap_1.xml
+                                                 http://www.example.com/sitemap_2.xml]
+    end
+
+    context 'when the sitemap url is relative' do
+      let(:robots) { 'Sitemap: /relative.xml' }
+
+      it 'returns the sitemap' do
+        robotex = Robotex.new
+        robotex.sitemaps(SPEC_DOMAIN).should == ['http://www.example.com/relative.xml']
+      end
+    end
+  end
+end
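
For reference, a minimal usage sketch of the #sitemaps method this patch adds. The robots.txt contents and the resulting URLs below are illustrative only, mirroring the spec fixtures above; the method parses robots.txt for the given host and returns any Sitemap entries as absolute URL strings, resolving relative entries via URI.join.

  require 'robotex'

  # Assuming http://www.example.com/robots.txt contains:
  #   Sitemap: http://www.example.com/sitemap_1.xml
  #   Sitemap: /sitemap_2.xml
  robotex = Robotex.new "My User Agent"
  robotex.sitemaps("http://www.example.com/")
  # => ["http://www.example.com/sitemap_1.xml", "http://www.example.com/sitemap_2.xml"]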