# Immutable value object for a hyperlink: a target (+href+) plus its label
# (+text+).
#
# * +href+ is stored as a String (anything responding to +to_s+, e.g. a URI,
#   is accepted).
# * +text+ is squished; +nil+ becomes the empty string.
#
# Two links are considered equal (+==+) when their hrefs match, regardless of
# their text.
Link = Data.define(:href, :text) do
  # @param href [#to_s] link target, coerced with +to_s+
  # @param text [String, nil] link label; squished, +nil+ is stored as ""
  def initialize(href:, text:)
    super(href: href.to_s, text: text&.squish || "")
  end

  # Implicit String conversion: a Link can be passed wherever Ruby expects a
  # String, and is then represented by its href.
  #
  # @return [String] the href
  def to_str = href

  # Equality based on +href+ only.
  #
  # Objects that do not expose +href+ (Strings, nil, ...) are simply "not
  # equal" rather than triggering a NoMethodError.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other) = other.respond_to?(:href) && href == other.href
end
require "net/http"

# Immutable description of a web page identified by its +url+, together with
# the +root+ URL of the site it belongs to. Both members are stored as URI
# objects. The class offers helpers to fetch the page (through the Rails
# cache) and to query its parsed HTML: title, text, headings and links.
class Page < Data.define(:url, :root)
  # How long a fetched page body stays in the Rails cache.
  CACHE_TTL = 10.minutes

  # Anchors that carry an href, excluding in-page fragments, mailto: and tel:.
  LINK_SELECTOR = "a[href]:not([href^='#']):not([href^=mailto]):not([href^=tel])".freeze

  # @param url [String] address of the page
  # @param root [String, nil] address of the site root; defaults to +url+
  def initialize(url:, root: nil)
    super(url: URI.parse(url), root: URI.parse(root || url))
  end

  # @return [String] +url+ with the leading +root+ removed ("" for the root page)
  def path = url.to_s.delete_prefix(root.to_s)

  # @return [Boolean] whether this page is the site root
  def root? = url == root

  # Page body, fetched over HTTP and cached for CACHE_TTL.
  #
  # +url+ is already a URI, so it is handed to Net::HTTP.get as is: URI.parse
  # only accepts strings and raises URI::InvalidURIError for a URI object.
  #
  # @return [String]
  def html = Rails.cache.fetch(url, expires_in: CACHE_TTL) { Net::HTTP.get(url) }

  # @return [Nokogiri::HTML::Document] freshly parsed document (not memoized)
  def dom = Nokogiri::HTML(html)

  # @param selector [String] CSS selector forwarded to the document
  def css(selector) = dom.css(selector)

  # @return [String, nil] squished document title
  def title = dom.title&.squish

  # @return [String, nil] squished text content of the whole document
  def text = dom.text&.squish

  # @return [Array<String>] squished text of every h1..h6, in document order
  def headings = dom.css("h1,h2,h3,h4,h5,h6").map { |heading| heading.text.squish }

  # @return [Array<Link>] links whose href starts with the root URL
  def internal_links = links.select { |link| internal?(link) }

  # @return [Array<Link>] links whose href does not start with the root URL
  def external_links = links.reject { |link| internal?(link) }

  # Every link matched by LINK_SELECTOR, as Link objects with absolute hrefs
  # stripped of fragment and query string. Anchors whose href cannot be parsed
  # as a URI are skipped.
  #
  # @return [Array<Link>]
  def links
    dom.css(LINK_SELECTOR).filter_map { |anchor| link_for(anchor) }
  end

  private

  # String#start_with? needs a String, hence the explicit +to_s+ on the URI.
  def internal?(link) = link.href.start_with?(root.to_s)

  # Builds the Link for one anchor node, or nil when its href is not a valid
  # URI (so that a single malformed link does not abort the whole extraction).
  def link_for(anchor)
    href = anchor["href"]
    uri = URI.parse(href)
    if uri.relative?
      # Relative hrefs are resolved against the site root, not the page URL.
      base = root.to_s.chomp("/")
      uri = URI.parse("#{base}/#{href.delete_prefix("/")}")
    end
    uri.fragment = nil
    uri.query = nil
    Link.new(href: uri, text: anchor.text)
  rescue URI::InvalidURIError
    nil
  end
end

Main Heading

+

Sub Heading

+

Some content

+ Contact + External + Phone + Email + Section + Relative + + + HTML + end + + before do + allow(Net::HTTP).to receive(:get).and_return(html_content) + end + + describe "#path" do + it "returns the path portion of the URL" do + expect(page.path).to eq("/about") + end + + context "when URL is the root URL" do + let(:url) { root } + + it "returns an empty string" do + expect(page.path).to eq("") + end + end + end + + describe "#root?" do + it "returns false when URL is not the root URL" do + expect(page.root?).to be false + end + + context "when URL is the root URL" do + let(:url) { root } + + it "returns true" do + expect(page.root?).to be true + end + end + end + + describe "#html" do + it "fetches and caches the page content" do + expect(Rails.cache).to receive(:fetch) + .with(parsed_url, expires_in: described_class::CACHE_TTL) + .and_yield + + expect(page.html).to eq(html_content) + expect(Net::HTTP).to have_received(:get) + .with(Addressable::URI.parse(url)) + .once + end + end + + describe "#dom" do + it "returns a Nokogiri::HTML document" do + expect(page.dom).to be_a(Nokogiri::HTML::Document) + end + end + + describe "#css" do + it "forwards CSS selector queries to the DOM" do + expect(page.css("h1").first.text).to eq("Main Heading") + end + end + + describe "#title" do + it "returns the page title" do + expect(page.title).to eq("Example Page") + end + end + + describe "#text" do + it "returns the full text content" do + expect(page.text).to include("Main Heading", "Sub Heading", "Some content") + end + end + + describe "#headings" do + it "returns an array of text, one line for each heading" do + expect(page.headings).to eq(["Main Heading", "Sub Heading"]) + end + end + + describe "#links" do + it "returns a hash of URLs and their link texts" do + expected_links = [ + Link.new("https://example.com/contact", "Contact"), + Link.new("https://external.com", "External"), + Link.new("https://example.com/relative/path", "Relative"), + ] + expect(page.links).to 
eq(expected_links) + end + + it "excludes mailto and tel links" do + expect(page.links.collect(&:text)).not_to include("Phone", "Email") + end + + it "excludes fragment-only links" do + expect(page.links.collect(&:text)).not_to include("Section") + end + + it "resolves relative URLs" do + expect(page.links.collect(&:href)).to include("https://example.com/relative/path") + end + + it "strips fragments and query parameters from URLs" do + html_with_params = html_content.gsub( + '', + '' + ) + allow(Net::HTTP).to receive(:get).and_return(html_with_params) + + expect(page.links.collect(&:href)).to include("https://external.com") + end + end + + describe "#internal_links" do + it "returns only links that start with the root URL" do + expected_internal_links = [ + Link.new("https://example.com/contact", "Contact"), + Link.new("https://example.com/relative/path", "Relative"), + ] + expect(page.internal_links).to eq(expected_internal_links) + end + end + + describe "#external_links" do + it "returns only links that don't start with the root URL" do + expected_external_links = [ + Link.new("https://external.com", "External") + ] + expect(page.external_links).to eq(expected_external_links) + end + end +end