tests added
mahmudsudo committed Oct 11, 2024
1 parent f8feb44 commit 932d153
Showing 9 changed files with 345 additions and 88 deletions.
44 changes: 31 additions & 13 deletions Cargo.lock

Some generated files are not rendered by default.

9 changes: 7 additions & 2 deletions Cargo.toml
@@ -1,8 +1,13 @@
[package]
name = "web_crawler_x"
name = "webcrawlerx"
version = "0.1.0"
edition = "2021"

authors = ["Mahmud Bello <[email protected]>"]
description = "A flexible and efficient web crawler written in Rust"
license = "MIT"
repository = "https://github.com/mahmudsudo/-WebCrawlerX-"
keywords = ["web", "crawler", "spider", "scraping"]
categories = ["web-programming"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Mahmud Bello

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
67 changes: 28 additions & 39 deletions README.md
@@ -1,59 +1,48 @@
🕷️ WebCrawlerX 🚀

Discover the hidden treasures of the internet with WebCrawlerX - your ultimate web crawling and scraping companion! 🌐
A flexible and efficient web crawler written in Rust.

Unleash the power of this versatile and efficient web crawler to extract valuable data from websites, be it for competitive analysis, market research, content aggregation, or any other data-driven application. With WebCrawlerX, you can effortlessly traverse the vast expanse of the internet and collect structured information in real-time.
## Features

🌟 Key Features 🌟
- Lightning-fast Crawling: Experience blazing speeds with our optimized crawling algorithms, ensuring swift data retrieval.
- Smart Parsing: Seamlessly extract relevant content using intelligent parsing techniques, handling different data structures with ease.
- Customizable Configurations: Tailor your crawling behavior with customizable settings for URLs, headers, rate limits, and more.
- User-Friendly Interface: Intuitive and easy-to-use interface for both beginners and advanced users.
- Scalable & Concurrent: Harness the power of concurrency to crawl multiple websites simultaneously, saving you valuable time and resources.
- Export & Store Data: Save extracted data in various formats (JSON, CSV, XML) or store directly in your preferred database.
- Multiple spider implementations (CVE Details, GitHub, Quotes)
- Configurable crawling parameters (delay, concurrent requests, page limit)
- Easy to extend with new spiders

## Installation

🛡️ Stay Ethical, Respect Robots.txt 🛡️
WebCrawlerX adheres to web crawling ethics, respecting the `robots.txt` protocol to avoid unwanted access. Always use the tool responsibly and follow best practices to avoid putting unnecessary strain on servers.

🚀 Join the Community 🚀
We believe in the power of collaboration. Join our vibrant community of developers, data enthusiasts, and researchers. Share your experiences, seek help, and contribute to the continuous improvement of WebCrawlerX.

Start exploring the untapped potential of the web today. Let WebCrawlerX empower your data-driven journey!

🐦 Follow us on Twitter: @BelloMahmud6
💼 Find us on LinkedIn: https://www.linkedin.com/in/bello-m-613575207/

#webcrawler #webscraping #datamining #webdata #rust #opensource
```bash
cargo install webcrawlerx
```


## Usage

🔧 Installation & Usage 🔧
Get started with WebCrawlerX in minutes! Clone the repository, install dependencies, and begin your web crawling adventure. Our comprehensive documentation and code examples ensure a smooth onboarding experience.
List available spiders:
```bash
webcrawlerx spiders
```

Run a specific spider:
```bash
webcrawlerx run --spider <spider_name> [--delay <ms>] [--concurrent <num>] [--limit <num>]
```

## Usage

```shell
$ cargo run -- spiders
$ cargo run -- run --spider cvedetails
```

Example:
```bash
webcrawlerx run --spider cvedetails --delay 200 --concurrent 2 --limit 10
```

## fmt

```shell
$ cargo fmt
```
## Adding a New Spider

## Install chromedriver
To add a new spider, create a new module in the `spiders` directory and implement the `Spider` trait. Then, update the `run_spider` function in `main.rs` to include your new spider.
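For illustration, here is a minimal sketch of what such a module might look like. Only the `Item` associated type of `Spider` is visible in this commit (in `src/crawler.rs`); the method set (`name`, `start_urls`, `scrape`, `process`), the error type, and the use of `async-trait` and `reqwest` are assumptions, and `ExampleSpider` is a hypothetical name:

```rust
// Hypothetical src/spiders/example.rs; the trait shape below is assumed, not copied from this commit.
use async_trait::async_trait;

type BoxError = Box<dyn std::error::Error + Send + Sync>;

// Assumed shape of the crate's Spider trait (only the `Item` associated type is visible in crawler.rs).
#[async_trait]
pub trait Spider: Send + Sync {
    type Item;
    fn name(&self) -> String;
    fn start_urls(&self) -> Vec<String>;
    async fn scrape(&self, url: String) -> Result<(Vec<Self::Item>, Vec<String>), BoxError>;
    async fn process(&self, item: Self::Item) -> Result<(), BoxError>;
}

pub struct ExampleSpider {
    http_client: reqwest::Client,
}

#[async_trait]
impl Spider for ExampleSpider {
    type Item = String;

    fn name(&self) -> String {
        "example".to_string()
    }

    fn start_urls(&self) -> Vec<String> {
        vec!["https://example.com/".to_string()]
    }

    // Fetch one page; return scraped items plus newly discovered URLs to enqueue.
    async fn scrape(&self, url: String) -> Result<(Vec<Self::Item>, Vec<String>), BoxError> {
        let body = self.http_client.get(url.as_str()).send().await?.text().await?;
        Ok((vec![body], Vec::new()))
    }

    // Handle one scraped item (persist it, print it, etc.).
    async fn process(&self, item: Self::Item) -> Result<(), BoxError> {
        println!("scraped {} bytes", item.len());
        Ok(())
    }
}
```

Adapt the signatures to whatever the crate's actual `Spider` trait declares before registering the new spider in `run_spider`.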

```shell
$ sudo apt install chromium-browser chromium-chromedriver
```
## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

### Run chromedriver
## License

```shell
$ chromedriver --port=4444 --disable-dev-shm-usage
```
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
13 changes: 13 additions & 0 deletions src/crawler.rs
@@ -13,13 +13,21 @@ use tokio::{
time::sleep,
};

/// Represents a web crawler with configurable parameters.
pub struct Crawler {
delay: Duration,
crawling_concurrency: usize,
processing_concurrency: usize,
}

impl Crawler {
/// Creates a new Crawler instance.
///
/// # Arguments
///
/// * `delay` - The delay between requests.
/// * `crawling_concurrency` - The maximum number of pages fetched concurrently.
/// * `processing_concurrency` - The maximum number of scraped items processed concurrently.
pub fn new(
delay: Duration,
crawling_concurrency: usize,
@@ -32,6 +32,11 @@ impl Crawler {
}
}

/// Runs the crawler with the given spider.
///
/// # Arguments
///
/// * `spider` - An Arc-wrapped instance of a struct implementing the Spider trait.
pub async fn run<T: Send + 'static>(&self, spider: Arc<dyn Spider<Item = T>>) {
let mut visited_urls = HashSet::<String>::new();
let crawling_concurrency = self.crawling_concurrency;
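For context, a minimal sketch of how this `Crawler` might be driven from `main.rs`. The third argument to `new` is assumed to be `processing_concurrency` (matching the struct fields above), the numeric values are illustrative, and `ExampleSpider` refers to the hypothetical spider sketched in the README section; none of this code is part of the commit:

```rust
// Hypothetical wiring; module paths are not shown in this commit, so imports are indicative only:
// use webcrawlerx::{crawler::Crawler, spiders::{Spider, example::ExampleSpider}};
use std::{sync::Arc, time::Duration};

#[tokio::main]
async fn main() {
    // delay between requests, crawling concurrency, processing concurrency (assumed order and values).
    let crawler = Crawler::new(Duration::from_millis(200), 2, 500);

    let spider: Arc<dyn Spider<Item = String>> = Arc::new(ExampleSpider {
        http_client: reqwest::Client::new(),
    });

    crawler.run(spider).await;
}
```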
(Diffs for the remaining changed files are not shown.)
