Skip to content

Commit

Permalink
Add link and page PORO helpers (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
goulvench authored Feb 14, 2025
2 parents 4a775f5 + aaf2c55 commit 8a3623c
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 0 deletions.
8 changes: 8 additions & 0 deletions app/models/link.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Immutable value object describing a hyperlink: its target (+href+) and its
# anchor +text+.
#
# Two links are considered the same link when they point at the same +href+,
# whatever their text. +==+, +eql?+ and +hash+ all follow that rule so that
# Array#-, Array#uniq and Hash lookups agree with +==+.
Link = Data.define(:href, :text) do
  # @param href [#to_s] link target; stored as a String
  # @param text [String, nil] anchor text; whitespace is squished, nil becomes ""
  def initialize(href:, text:)
    super(href: href.to_s, text: text&.squish || "")
  end

  # Lets a Link be passed wherever a String URL is implicitly expected.
  #
  # @return [String] the href
  def to_str = href

  # Href-based equality. Any object exposing +href+ can be compared; objects
  # that do not (nil, plain strings, ...) are simply unequal instead of raising.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.respond_to?(:href) && href == other.href
  end

  # Strict counterpart of +==+ used by Hash, Array#uniq and Array#-.
  #
  # @param other [Object]
  # @return [Boolean]
  def eql?(other)
    other.instance_of?(self.class) && href.eql?(other.href)
  end

  # Hash code derived from +href+ only, to stay consistent with +eql?+.
  #
  # @return [Integer]
  def hash = href.hash
end
34 changes: 34 additions & 0 deletions app/models/page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
require "net/http"

# Immutable value object for a web page: its +url+ and the +root+ URL of the
# site it belongs to (defaults to +url+ itself). Both are stored as URI objects.
#
# The page body is fetched over HTTP on demand and cached in Rails.cache;
# the remaining readers are views over the parsed document.
class Page < Data.define(:url, :root)
  # How long a fetched page body stays in Rails.cache.
  CACHE_TTL = 10.minutes

  # Anchors with an href, excluding in-page fragments, mailto: and tel: links.
  LINK_SELECTOR = "a[href]:not([href^='#']):not([href^=mailto]):not([href^=tel])"

  # @param url [String, URI] address of the page
  # @param root [String, URI, nil] root address of the site; +url+ when nil
  def initialize(url:, root: nil)
    super(url: URI.parse(url), root: URI.parse(root || url))
  end

  # @return [String] +url+ with the leading +root+ removed
  def path = url.to_s.delete_prefix(root.to_s)

  # @return [Boolean] whether this page is the root page itself
  def root? = url == root

  # Raw body of the page, cached under +url+ for CACHE_TTL.
  # +url+ is already a URI (see #initialize), so it is handed to Net::HTTP as is.
  #
  # @return [String]
  def html = Rails.cache.fetch(url, expires_in: CACHE_TTL) { Net::HTTP.get(url) }

  # Parsed document. Built anew on every call (instances are frozen Data objects).
  def dom = Nokogiri::HTML(html)

  # @param selector [String] CSS selector
  # @return nodes of the document matching +selector+
  def css(selector) = dom.css(selector)

  # @return [String, nil] squished document title
  def title = dom.title&.squish

  # @return [String, nil] squished text content of the whole document
  def text = dom.text&.squish

  # @return [Array<String>] squished text of every h1..h6, in document order
  def headings = dom.css("h1,h2,h3,h4,h5,h6").map { |node| node.text.squish }

  # @return [Array<Link>] links located under +root+
  def internal_links = links.select { |link| internal?(link) }

  # Filters a single #links result rather than subtracting #internal_links,
  # so the document is fetched and parsed once instead of twice.
  #
  # @return [Array<Link>] links not located under +root+
  def external_links = links.reject { |link| internal?(link) }

  # Every navigable link of the page as an absolute URL without query string
  # or fragment. Hrefs that cannot be parsed as a URI are skipped.
  #
  # @return [Array<Link>]
  def links
    dom.css(LINK_SELECTOR).filter_map { |anchor| build_link(anchor) }
  end

  private

  # A link is internal when it is the root itself or lives below it. Requiring
  # a "/" boundary after the root keeps look-alike hosts such as
  # "https://example.com.evil.test" from matching root "https://example.com".
  def internal?(link)
    base = root.to_s.chomp("/")
    link.href == base || link.href.start_with?("#{base}/")
  end

  # Builds the Link for one anchor node, or returns nil when its href is not a
  # parsable URI (one malformed href must not discard every other link).
  def build_link(anchor)
    href = anchor["href"]
    uri = URI.parse(href)

    if uri.relative?
      uri =
        if href.start_with?("//")
          # Scheme-relative reference: it names its own host, only the scheme is borrowed.
          URI.parse("#{root.scheme}:#{href}")
        else
          # Relative references are resolved against root (not the page URL).
          URI.parse("#{root.to_s.chomp("/")}/#{href.delete_prefix("/")}")
        end
    end

    uri.fragment = nil
    uri.query = nil
    Link.new(uri, anchor.text)
  rescue URI::InvalidURIError
    nil
  end
end
47 changes: 47 additions & 0 deletions spec/models/link_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
require "rails_helper"

# Specs for the Link value object: construction/normalisation, implicit string
# conversion and href-based equality.
RSpec.describe Link do
  let(:href) { "https://example.com" }

  # Small factory so each example only states what it varies.
  def build_link(target: href, text: "Example")
    described_class.new(href: target, text: text)
  end

  describe "#initialize" do
    it "creates a Link with href and text" do
      expect(build_link).to have_attributes(href: "https://example.com", text: "Example")
    end

    it "converts href to string" do
      from_uri = build_link(target: URI("https://example.com"))
      expect(from_uri.href).to eq("https://example.com")
    end

    it "squishes text" do
      padded = build_link(text: " Example Text ")
      expect(padded.text).to eq("Example Text")
    end

    it "handles nil text" do
      without_text = build_link(text: nil)
      expect(without_text.text).to eq("")
    end
  end

  describe "#to_str" do
    it "returns the href" do
      expect(build_link.to_str).to eq("https://example.com")
    end
  end

  describe "#==" do
    it "considers links equal if they have the same href" do
      first = build_link(text: "Example 1")
      second = build_link(text: "Example 2")
      expect(first).to eq(second)
    end

    it "considers links different if they have different hrefs" do
      first = build_link(target: "https://example1.com")
      second = build_link(target: "https://example2.com")
      expect(first).not_to eq(second)
    end
  end
end
156 changes: 156 additions & 0 deletions spec/models/page_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
require "rails_helper"

# Specs for Page. Net::HTTP.get is stubbed for every example, so no request
# leaves the test process; the fixture below is the body every page "returns".
RSpec.describe Page do
  let(:root) { "https://example.com" }
  let(:url) { "https://example.com/about" }
  let(:parsed_url) { URI.parse(url) }
  let(:page) { described_class.new(url:, root:) }
  let(:html_content) do
    <<~HTML
      <!DOCTYPE html>
      <html>
      <head>
      <title>Example Page</title>
      </head>
      <body>
      <h1>Main Heading</h1>
      <h2>Sub Heading</h2>
      <p>Some content</p>
      <a href="/contact">Contact</a>
      <a href="https://external.com">External</a>
      <a href="tel:123456">Phone</a>
      <a href="mailto:[email protected]">Email</a>
      <a href="#section">Section</a>
      <a href="relative/path">Relative</a>
      </body>
      </html>
    HTML
  end

  before do
    allow(Net::HTTP).to receive(:get).and_return(html_content)
  end

  describe "#path" do
    it "returns the path portion of the URL" do
      expect(page.path).to eq("/about")
    end

    context "when URL is the root URL" do
      let(:url) { root }

      it "returns an empty string" do
        expect(page.path).to eq("")
      end
    end
  end

  describe "#root?" do
    it "returns false when URL is not the root URL" do
      expect(page.root?).to be false
    end

    context "when URL is the root URL" do
      let(:url) { root }

      it "returns true" do
        expect(page.root?).to be true
      end
    end
  end

  describe "#html" do
    it "fetches and caches the page content" do
      expect(Rails.cache).to receive(:fetch)
        .with(parsed_url, expires_in: described_class::CACHE_TTL)
        .and_yield

      expect(page.html).to eq(html_content)
      # Page builds its URLs with the standard library's URI, so the request
      # is expected with that same type rather than an Addressable::URI.
      expect(Net::HTTP).to have_received(:get)
        .with(parsed_url)
        .once
    end
  end

  describe "#dom" do
    it "returns a Nokogiri::HTML document" do
      expect(page.dom).to be_a(Nokogiri::HTML::Document)
    end
  end

  describe "#css" do
    it "forwards CSS selector queries to the DOM" do
      expect(page.css("h1").first.text).to eq("Main Heading")
    end
  end

  describe "#title" do
    it "returns the page title" do
      expect(page.title).to eq("Example Page")
    end
  end

  describe "#text" do
    it "returns the full text content" do
      expect(page.text).to include("Main Heading", "Sub Heading", "Some content")
    end
  end

  describe "#headings" do
    it "returns an array of text, one line for each heading" do
      expect(page.headings).to eq(["Main Heading", "Sub Heading"])
    end
  end

  describe "#links" do
    it "returns an array of Links with their URLs and link texts" do
      expected_links = [
        Link.new("https://example.com/contact", "Contact"),
        Link.new("https://external.com", "External"),
        Link.new("https://example.com/relative/path", "Relative"),
      ]
      expect(page.links).to eq(expected_links)
    end

    it "excludes mailto and tel links" do
      expect(page.links.collect(&:text)).not_to include("Phone", "Email")
    end

    it "excludes fragment-only links" do
      expect(page.links.collect(&:text)).not_to include("Section")
    end

    it "resolves relative URLs" do
      expect(page.links.collect(&:href)).to include("https://example.com/relative/path")
    end

    it "strips fragments and query parameters from URLs" do
      html_with_params = html_content.gsub(
        '<a href="https://external.com">',
        '<a href="https://external.com?param=1#section">'
      )
      # Re-stub before the page is first read so this body is the one served.
      allow(Net::HTTP).to receive(:get).and_return(html_with_params)

      expect(page.links.collect(&:href)).to include("https://external.com")
    end
  end

  describe "#internal_links" do
    it "returns only links that start with the root URL" do
      expected_internal_links = [
        Link.new("https://example.com/contact", "Contact"),
        Link.new("https://example.com/relative/path", "Relative"),
      ]
      expect(page.internal_links).to eq(expected_internal_links)
    end
  end

  describe "#external_links" do
    it "returns only links that don't start with the root URL" do
      expected_external_links = [
        Link.new("https://external.com", "External")
      ]
      expect(page.external_links).to eq(expected_external_links)
    end
  end
end

0 comments on commit 8a3623c

Please sign in to comment.