diff --git a/Cargo.toml b/Cargo.toml index e6a6386..3a3aaf0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,3 +33,8 @@ harness = false [[bench]] name = "browser" harness = false + +[profile.release] +strip = true +opt-level = "z" +lto = true \ No newline at end of file diff --git a/benches/browser.rs b/benches/browser.rs index 086a406..683950d 100644 --- a/benches/browser.rs +++ b/benches/browser.rs @@ -98,9 +98,7 @@ fn criterion_benchmark(c: &mut Criterion) { .measurement_time(std::time::Duration::from_secs(20)); // 1.54 seconds - group.bench_function("Create new browser basic", |b| { - b.iter(create_connection) - }); + group.bench_function("Create new browser basic", |b| b.iter(create_connection)); // 0.943 seconds group.bench_function("Create new browser headless", |b| { diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..72dd056 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,9 @@ +blank_lines_upper_bound = 2 +combine_control_expr = false +edition = "2021" +force_multiline_blocks = true +format_code_in_doc_comments = true +format_generated_files = false +imports_layout = "HorizontalVertical" +imports_granularity = "Crate" +group_imports = "StdExternalCrate" \ No newline at end of file diff --git a/src/db/helpers.rs b/src/db/helpers.rs index 9ddad09..72dd6c3 100644 --- a/src/db/helpers.rs +++ b/src/db/helpers.rs @@ -1,13 +1,12 @@ //! Database operation helpers for sqlite, using diesel -use diesel::prelude::*; +use diesel::{prelude::*, RunQueryDsl}; -use diesel::RunQueryDsl; - -use super::models::{Stream, StreamNew}; -use super::schema; -use super::schema::stream; -use super::schema::stream::dsl::*; +use super::{ + models::{Stream, StreamNew}, + schema, + schema::{stream, stream::dsl::*}, +}; pub fn establish_connection() -> Result { let database_url = format!("{}/sports.db", std::env::temp_dir().display()); @@ -15,13 +14,11 @@ pub fn establish_connection() -> Result { Ok(SqliteConnection::establish(&database_url)?) } -pub fn create_stream( - conn: &mut SqliteConnection, - new_stream: &StreamNew, -) -> Result { - Ok(diesel::insert_or_ignore_into(stream::table) - .values(new_stream) - .execute(conn)?) +pub fn create_stream(conn: &mut SqliteConnection, + new_stream: &StreamNew) + -> Result { + Ok(diesel::insert_or_ignore_into(stream::table).values(new_stream) + .execute(conn)?) } pub fn get_streams(conn: &mut SqliteConnection) -> Result, anyhow::Error> { @@ -29,22 +26,18 @@ pub fn get_streams(conn: &mut SqliteConnection) -> Result, anyhow::E } pub fn get_empty_streams(conn: &mut SqliteConnection) -> Result, anyhow::Error> { - Ok(stream - .filter(schema::stream::stream_link.eq("")) - .load::(conn)?) + Ok(stream.filter(schema::stream::stream_link.eq("")) + .load::(conn)?) } pub fn get_linked_streams(conn: &mut SqliteConnection) -> Result, anyhow::Error> { - Ok(stream - .filter(schema::stream::stream_link.ne("")) - .load::(conn)?) + Ok(stream.filter(schema::stream::stream_link.ne("")) + .load::(conn)?) } -pub fn get_streams_by_id( - conn: &mut SqliteConnection, - search_id: i32, -) -> Result, anyhow::Error> { - Ok(stream - .filter(schema::stream::id.eq(search_id)) - .load::(conn)?) +pub fn get_streams_by_id(conn: &mut SqliteConnection, + search_id: i32) + -> Result, anyhow::Error> { + Ok(stream.filter(schema::stream::id.eq(search_id)) + .load::(conn)?) } diff --git a/src/db/models.rs b/src/db/models.rs index a5b8331..f566752 100644 --- a/src/db/models.rs +++ b/src/db/models.rs @@ -1,9 +1,7 @@ //! This module contains the models for the diesel ORM use diesel::prelude::*; -use serde::ser::SerializeStruct; -use serde::Deserialize; -use serde::Serialize; +use serde::{ser::SerializeStruct, Deserialize, Serialize}; #[derive(Debug, Queryable, Deserialize, Clone)] pub struct Stream { @@ -65,10 +63,8 @@ mod tests { }; let serialised = serde_json::to_string(&stream).unwrap(); - assert_eq!( - serialised, - "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"stream_link\"]}" - ); + assert_eq!(serialised, + "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"stream_link\"]}"); } #[test] @@ -85,10 +81,8 @@ mod tests { }; let serialised = serde_json::to_string(&stream).unwrap(); - assert_eq!( - serialised, - "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"stream_link\",\"stream_link2\"]}" - ); + assert_eq!(serialised, + "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"stream_link\",\"stream_link2\"]}"); } #[test] @@ -105,9 +99,7 @@ mod tests { }; let serialised = serde_json::to_string(&stream).unwrap(); - assert_eq!( - serialised, - "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"\"]}" - ); + assert_eq!(serialised, + "{\"id\":1,\"home\":\"home\",\"away\":\"away\",\"start_time\":\"start_time\",\"league\":\"league\",\"country\":\"country\",\"url\":\"url\",\"stream_link\":[\"\"]}"); } } diff --git a/src/query_selectors.rs b/src/query_selectors.rs index 1cc75e0..7166958 100644 --- a/src/query_selectors.rs +++ b/src/query_selectors.rs @@ -1,10 +1,8 @@ //! This module contains the functions to get the data from the dom of the eventlist //! and return the data as a string -use tl::Parser; -use tl::VDom; - use thiserror::Error; +use tl::{Parser, VDom}; #[derive(Error, Debug)] pub enum DomParseError { @@ -37,24 +35,24 @@ pub enum DomParseError { /// let parser = dom.parser(); /// /// let link = get_url_from_dom(&dom, &parser).unwrap(); -/// assert_eq!(link, "https://sportshub.fan/event/ypiranga_rs_novo_hamburgo_191503337/"); +/// assert_eq!(link, +/// "https://sportshub.fan/event/ypiranga_rs_novo_hamburgo_191503337/"); /// ``` pub fn get_url_from_dom(dom: &VDom<'_>, parser: &Parser<'_>) -> Result { - let q = dom - .query_selector("a") - .ok_or(DomParseError::NotFound)? - .next() - .ok_or(DomParseError::NotFound)? - .get(parser) - .ok_or(DomParseError::Unknown)? - .as_tag() - .ok_or(DomParseError::NoAttributeFound)? - .attributes() - .get("href") - .ok_or(DomParseError::NoAttributeFound)? - .ok_or(DomParseError::NoAttributeFound)? - .as_utf8_str() - .to_string(); + let q = dom.query_selector("a") + .ok_or(DomParseError::NotFound)? + .next() + .ok_or(DomParseError::NotFound)? + .get(parser) + .ok_or(DomParseError::Unknown)? + .as_tag() + .ok_or(DomParseError::NoAttributeFound)? + .attributes() + .get("href") + .ok_or(DomParseError::NoAttributeFound)? + .ok_or(DomParseError::NoAttributeFound)? + .as_utf8_str() + .to_string(); Ok(q) } @@ -81,19 +79,17 @@ pub fn get_url_from_dom(dom: &VDom<'_>, parser: &Parser<'_>) -> Result, - parser: &Parser<'_>, -) -> Result { - let q = dom - .query_selector("span.mr-5") - .ok_or(DomParseError::NotFound)? - .next() - .ok_or(DomParseError::NotFound)? - .get(parser) - .ok_or(DomParseError::Unknown)? - .inner_text(parser) - .to_string(); +pub fn get_game_name_from_dom(dom: &VDom<'_>, + parser: &Parser<'_>) + -> Result { + let q = dom.query_selector("span.mr-5") + .ok_or(DomParseError::NotFound)? + .next() + .ok_or(DomParseError::NotFound)? + .get(parser) + .ok_or(DomParseError::Unknown)? + .inner_text(parser) + .to_string(); Ok(q) } @@ -121,15 +117,14 @@ pub fn get_game_name_from_dom( /// let event_info = get_info_from_dom(&dom, &parser).unwrap(); /// assert_eq!(event_info, "Brazilian Campeonato Gaucho"); pub fn get_info_from_dom(dom: &VDom<'_>, parser: &Parser<'_>) -> Result { - let q = dom - .query_selector("span.evdesc.event-desc") - .ok_or(DomParseError::NotFound)? - .next() - .ok_or(DomParseError::NotFound)? - .get(parser) - .ok_or(DomParseError::Unknown)? - .inner_text(parser) - .to_string(); + let q = dom.query_selector("span.evdesc.event-desc") + .ok_or(DomParseError::NotFound)? + .next() + .ok_or(DomParseError::NotFound)? + .get(parser) + .ok_or(DomParseError::Unknown)? + .inner_text(parser) + .to_string(); Ok(q) } @@ -155,26 +150,25 @@ pub fn get_info_from_dom(dom: &VDom<'_>, parser: &Parser<'_>) -> Result, parser: &Parser<'_>) -> Result { - let q = dom - .query_selector("i.icon-competitions") - .ok_or(DomParseError::NotFound)? - .next() - .ok_or(DomParseError::NotFound)? - .get(parser) - .ok_or(DomParseError::Unknown)? - .as_tag() - .ok_or(DomParseError::NoAttributeFound)? - .attributes() - .get("style") - .ok_or(DomParseError::NoAttributeFound)? - .ok_or(DomParseError::NoAttributeFound)? - .as_utf8_str() - .split('/') - .last() - .ok_or(DomParseError::NoAttributeFound)? - .replace(");", "") - .replace(".svg", "") - .to_string(); + let q = dom.query_selector("i.icon-competitions") + .ok_or(DomParseError::NotFound)? + .next() + .ok_or(DomParseError::NotFound)? + .get(parser) + .ok_or(DomParseError::Unknown)? + .as_tag() + .ok_or(DomParseError::NoAttributeFound)? + .attributes() + .get("style") + .ok_or(DomParseError::NoAttributeFound)? + .ok_or(DomParseError::NoAttributeFound)? + .as_utf8_str() + .split('/') + .last() + .ok_or(DomParseError::NoAttributeFound)? + .replace(");", "") + .replace(".svg", "") + .to_string(); Ok(q) } diff --git a/src/scrape_utils.rs b/src/scrape_utils.rs index 0b814b0..7e5d156 100644 --- a/src/scrape_utils.rs +++ b/src/scrape_utils.rs @@ -2,23 +2,18 @@ //! and save them to database. It also checks the stream links and saves them //! to database. -use std::borrow::BorrowMut; - -use std::sync::Arc; -use std::sync::Mutex; -use std::thread; +use std::{ + borrow::BorrowMut, + sync::{Arc, Mutex}, + thread, +}; use anyhow::anyhow; -use diesel::ExpressionMethods; -use diesel::RunQueryDsl; -use diesel::SqliteConnection; -use headless_chrome::Browser; -use headless_chrome::Tab; +use db::{models, schema}; +use diesel::{ExpressionMethods, RunQueryDsl, SqliteConnection}; +use headless_chrome::{Browser, Tab}; -use crate::db; -use crate::query_selectors; -use db::models; -use db::schema; +use crate::{db, query_selectors}; /// This function scrapes all the games from the home page and saves them to database. /// It takes roughly 1 second to scrape ~500 games. @@ -30,13 +25,12 @@ use db::schema; pub fn today_games(tab: &Tab, conn: &mut SqliteConnection) -> Result<(), anyhow::Error> { // we navigate to the page and wait until the table showing links is loaded tab.navigate_to("https://reddit.sportshub.fan/")? - .wait_for_element(".list-events")?; + .wait_for_element(".list-events")?; // we get the html of the table and remove all the tabs and newlines - let html = tab - .find_element(".list-events")? - .get_content()? - .replace(['\t', '\n'], ""); + let html = tab.find_element(".list-events")? + .get_content()? + .replace(['\t', '\n'], ""); // create the parser using tl let dom = tl::parse(&html, tl::ParserOptions::default())?; @@ -78,8 +72,14 @@ pub fn parse_game(conn: &mut SqliteConnection, html: &str) -> Result<(), anyhow: let name = query_selectors::get_game_name_from_dom(&dom, parser)?; let teams: Vec<&str> = name.split('–').collect(); - let home = teams.first().unwrap_or(&"???").trim().to_string(); - let away = teams.last().unwrap_or(&"???").trim().to_string(); + let home = teams.first() + .unwrap_or(&"???") + .trim() + .to_string(); + let away = teams.last() + .unwrap_or(&"???") + .trim() + .to_string(); // we get the info of the game, such as time, league, country // format is: League / Time @@ -87,8 +87,12 @@ pub fn parse_game(conn: &mut SqliteConnection, html: &str) -> Result<(), anyhow: // we split the info into league and time let mut info_parsed = info.split('/'); - let league = &info_parsed.next().unwrap_or("Unknown").to_string(); - let time = info_parsed.next().unwrap_or("Unknown").to_string(); + let league = &info_parsed.next() + .unwrap_or("Unknown") + .to_string(); + let time = info_parsed.next() + .unwrap_or("Unknown") + .to_string(); // we get the country of the game // format is: @@ -96,15 +100,13 @@ pub fn parse_game(conn: &mut SqliteConnection, html: &str) -> Result<(), anyhow: // we create a new stream and save it to database // we leave stream_link empty for now - let new_stream = models::StreamNew { - home: &home, - away: &away, - start_time: &time, - league, - country: &country, - url: &url, - stream_link: "", - }; + let new_stream = models::StreamNew { home: &home, + away: &away, + start_time: &time, + league, + country: &country, + url: &url, + stream_link: "" }; db::helpers::create_stream(conn, &new_stream)?; @@ -121,12 +123,12 @@ pub fn parse_game(conn: &mut SqliteConnection, html: &str) -> Result<(), anyhow: /// *tab* - is the tab that we use to navigate to the page and scrape the links, we use headless_chrome tabs. /// *conn* - is the connection to the database, we use diesel to save the links to database. /// *url* - is the url of the game page that we get from database. -pub fn url_to_links( - tab: &Tab, - conn: &mut SqliteConnection, - url: &str, -) -> Result<(), anyhow::Error> { - tab.navigate_to(url)?.wait_for_element("#content-event")?; +pub fn url_to_links(tab: &Tab, + conn: &mut SqliteConnection, + url: &str) + -> Result<(), anyhow::Error> { + tab.navigate_to(url)? + .wait_for_element("#content-event")?; // they encode url, so we need to decode it let u = urlencoding::decode(url).unwrap(); @@ -145,19 +147,24 @@ pub fn url_to_links( // we get the links from the elements // checking if they have "//" in them because some of them are just text - let stream_links: Vec = elements - .unwrap() - .into_iter() - .map(|e| e.get_attributes().unwrap().unwrap().get(1).unwrap().clone()) - .collect(); + let stream_links: Vec = elements.unwrap() + .into_iter() + .map(|e| { + e.get_attributes() + .unwrap() + .unwrap() + .get(1) + .unwrap() + .clone() + }) + .collect(); let joined_links = stream_links.join(","); // we save the links to database - diesel::update(schema::stream::table) - .set(schema::stream::stream_link.eq(joined_links)) - .filter(schema::stream::url.eq(u)) - .execute(conn)?; + diesel::update(schema::stream::table).set(schema::stream::stream_link.eq(joined_links)) + .filter(schema::stream::url.eq(u)) + .execute(conn)?; Ok(()) } @@ -166,18 +173,18 @@ pub fn url_to_links( /// It takes roughly 27 seconds to check all the links. /// (My 8gb ram m1 macbook air with a 90mbps internet connection can handle 10 tabs relatively easily) /// It can be improved by using a shared queue instead of splitting it. -pub fn check_all_links( - browser: &Browser, - conn: &mut SqliteConnection, - tabs_count: usize, -) -> Result<(), anyhow::Error> { +pub fn check_all_links(browser: &Browser, + conn: &mut SqliteConnection, + tabs_count: usize) + -> Result<(), anyhow::Error> { // we get all the streams from database that have no links // wrap it in an arc to share it between threads let all_streams = Arc::new(db::helpers::get_empty_streams(conn)?); // we split the streams into chunks and create a thread for each chunk - let chunked_streams: Vec<&[models::Stream]> = - all_streams.chunks(all_streams.len() / tabs_count).collect(); + let chunked_streams: Vec<&[models::Stream]> = all_streams.chunks(all_streams.len() + / tabs_count) + .collect(); let length = all_streams.len(); @@ -194,29 +201,28 @@ pub fn check_all_links( tabs.push(tab.clone()); // we get the streams from the chunked streams and turn it to a vec - let mut streams = chunked_streams - .get(tab_num) - .ok_or(anyhow!("invalid chunked_stream index"))? - .to_vec() - .clone(); + let mut streams = chunked_streams.get(tab_num) + .ok_or(anyhow!("invalid chunked_stream index"))? + .to_vec() + .clone(); let completed = completed_mutex.clone(); threads.push(thread::spawn(move || { - // sqlite should be able to handle 10 connections at once - let mut conn = db::helpers::establish_connection().unwrap(); - - // we iterate over all the streams and check them - while let Some(stream) = streams.pop() { - check_link(tab.clone().borrow_mut(), &mut conn, &stream.url).unwrap(); - // we print the progress - let mut completed_count = completed - .lock() - .expect("mutex is already opened by current thread"); - *completed_count += 1; - println!("{} / {}", completed_count, length); - } - })); + // sqlite should be able to handle 10 connections at once + let mut conn = db::helpers::establish_connection().unwrap(); + + // we iterate over all the streams and check them + while let Some(stream) = streams.pop() { + check_link(tab.clone().borrow_mut(), &mut conn, &stream.url).unwrap(); + // we print the progress + let mut completed_count = + completed.lock() + .expect("mutex is already opened by current thread"); + *completed_count += 1; + println!("{} / {}", completed_count, length); + } + })); } // we wait for all the threads to finish @@ -228,19 +234,16 @@ pub fn check_all_links( let time_end = std::time::Instant::now(); - println!( - "Time elapsed to scan all games: {:?}", - time_end - time_start - ); + println!("Time elapsed to scan all games: {:?}", + time_end - time_start); Ok(()) } -pub fn check_link( - tab: &mut Arc, - conn: &mut SqliteConnection, - link: &str, -) -> Result<(), anyhow::Error> { +pub fn check_link(tab: &mut Arc, + conn: &mut SqliteConnection, + link: &str) + -> Result<(), anyhow::Error> { url_to_links(tab.borrow_mut(), conn.borrow_mut(), link).unwrap(); Ok(()) @@ -249,12 +252,10 @@ pub fn check_link( pub fn start_scraping(open_tabs: usize) -> Result<(), anyhow::Error> { // realised we didnt need adblocker when headless let browser = Browser::new({ - headless_chrome::LaunchOptions { - headless: true, - sandbox: true, - ignore_certificate_errors: true, - ..Default::default() - } + headless_chrome::LaunchOptions { headless: true, + sandbox: true, + ignore_certificate_errors: true, + ..Default::default() } })?; let mut conn = db::helpers::establish_connection()?; @@ -277,7 +278,11 @@ pub fn start_scraping(open_tabs: usize) -> Result<(), anyhow::Error> { // we close all the tabs because otherwise it shows an error when program // finishes - for t in (*browser.get_tabs().as_ref().lock().unwrap()).iter() { + for t in (*browser.get_tabs() + .as_ref() + .lock() + .unwrap()).iter() + { t.close(true)?; } diff --git a/src/web_server_utils.rs b/src/web_server_utils.rs index b6a7850..0f1871e 100644 --- a/src/web_server_utils.rs +++ b/src/web_server_utils.rs @@ -1,11 +1,11 @@ //! This module contains the web server for the API. //! It uses the rocket framework. -use crate::db; use db::models::Stream; - use rocket::{get, routes, serde::json::Json, Rocket}; +use crate::db; + #[get("/")] async fn get_all_streams() -> Json> { let mut conn = db::helpers::establish_connection().unwrap(); @@ -31,15 +31,12 @@ async fn get_stream_by_id(id: i32) -> Json> { } pub async fn run(port: u16) { - Rocket::custom(rocket::Config { - port, - ..Default::default() - }) - .mount( - "/", - routes![get_all_streams, get_active_streams, get_stream_by_id], - ) - .launch() - .await - .unwrap(); + Rocket::custom(rocket::Config { port, + ..Default::default() }).mount("/", + routes![get_all_streams, + get_active_streams, + get_stream_by_id]) + .launch() + .await + .unwrap(); }