add #sitemaps method #6

Open · wants to merge 1 commit into base: master
1 change: 1 addition & 0 deletions README.rdoc
@@ -8,6 +8,7 @@ Usage:
  robotex = Robotex.new "My User Agent"
  robotex.allowed?("http://www.example.com/foo")
  robotex.delay!("http://www.example.com/foo") # wait until any specified Crawl-Delay has passed
+  robotex.sitemaps("http://www.example.com/") # return an array of sitemap urls

== Acknowledgements

13 changes: 12 additions & 1 deletion lib/robotex.rb
@@ -15,6 +15,8 @@ class Robotex

  class ParsedRobots

+    attr_reader :sitemaps
+
    def initialize(uri, user_agent)
      io = Robotex.get_robots_txt(uri, user_agent)

@@ -25,6 +27,7 @@ def initialize(uri, user_agent)
      @disallows = {}
      @allows = {}
      @delays = {}
+      @sitemaps = []
      agent = /.*/
      io.each do |line|
        next if line =~ /^\s*(#.*|$)/
@@ -43,6 +46,8 @@ def initialize(uri, user_agent)
          @disallows[agent] << to_regex(value)
        when "crawl-delay"
          @delays[agent] = value.to_i
+        when "sitemap"
+          @sitemaps << URI.join(uri, value).to_s
        end
      end

@@ -148,5 +153,11 @@ def delay!(uri)
    sleep delay - (Time.now - @last_accessed) if !!delay
    @last_accessed = Time.now
  end

+  #
+  # Returns an array of the sitemap urls specified in robots.txt
+  #
+  def sitemaps(uri)
+    parse_host(uri).sitemaps
+  end
end
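
For reference, a minimal usage sketch of the method this PR adds; the site URL and the returned value are illustrative, not taken from the PR:

  require 'robotex'

  # Fetch and parse http://www.example.com/robots.txt, then collect any
  # Sitemap: entries it declares.
  robotex = Robotex.new("My User Agent")
  robotex.sitemaps("http://www.example.com/")
  # => ["http://www.example.com/sitemap.xml"]  (depends on the site's robots.txt)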
65 changes: 46 additions & 19 deletions spec/robotex_spec.rb
@@ -1,25 +1,30 @@
require 'spec_helper'

describe Robotex do
+  let(:robots) do
+    <<~ROBOTS
+      User-Agent: msnbot
+      Crawl-Delay: 20

-  before(:all) do
+      User-Agent: bender
+      Disallow: /my_shiny_metal_ass
+
+      User-Agent: *
+      Disallow: /login
+      Allow: /
+
+      Disallow: /locked
+      Allow: /locked
+    ROBOTS
+  end
+
+  let(:response) do
+    { body: robots, content_type: 'text/plain', status: [200, "OK"] }
+  end
+
+  before do
    FakeWeb.allow_net_connect = false
-    robots = <<-END
-User-Agent: msnbot
-Crawl-Delay: 20
-
-User-Agent: bender
-Disallow: /my_shiny_metal_ass
-
-User-Agent: *
-Disallow: /login
-Allow: /
-
-Disallow: /locked
-Allow: /locked
-END
-    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
-    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', response)
  end

describe '#initialize' do
@@ -73,15 +78,37 @@
robotex = Robotex.new
robotex.delay(SPEC_DOMAIN).should be_nil
end
end

context 'when Crawl-Delay is specified for the user-agent' do
it 'returns the delay as a Fixnum' do
robotex = Robotex.new('msnbot')
robotex.delay(SPEC_DOMAIN).should == 20
end
end
end
end

end
+  describe '#sitemaps' do
+    let(:robots) do
+      <<~ROBOTS
+        Sitemap: http://www.example.com/sitemap_1.xml
+        Sitemap: http://www.example.com/sitemap_2.xml
+      ROBOTS
+    end
+
+    it 'returns an array of sitemaps' do
+      robotex = Robotex.new
+      robotex.sitemaps(SPEC_DOMAIN).should == %w[http://www.example.com/sitemap_1.xml
+                                                 http://www.example.com/sitemap_2.xml]
+    end
+
+    context 'when the sitemap url is relative' do
+      let(:robots) { 'Sitemap: /relative.xml' }
+
+      it 'returns the sitemap' do
+        robotex = Robotex.new
+        robotex.sitemaps(SPEC_DOMAIN).should == ['http://www.example.com/relative.xml']
+      end
+    end
+  end
end
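
As the relative-URL spec above suggests, the patch resolves each Sitemap entry against the robots.txt location with Ruby's stdlib URI.join, so relative entries come back absolute while absolute entries pass through unchanged. A quick sketch of that stdlib behavior, using example URLs from the specs:

  require 'uri'

  # A relative Sitemap entry is resolved against the host of the robots.txt.
  URI.join("http://www.example.com/", "/relative.xml").to_s
  # => "http://www.example.com/relative.xml"

  # An absolute Sitemap entry is left as-is.
  URI.join("http://www.example.com/", "http://www.example.com/sitemap_1.xml").to_s
  # => "http://www.example.com/sitemap_1.xml"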