diff --git a/README.md b/README.md index 881122a..455bf3c 100644 --- a/README.md +++ b/README.md @@ -18,22 +18,32 @@ **OpenClassrooms Python Developer Project #2: Use Python Basics for Market Analysis** -Scraping of [books.toscrape.com](https://books.toscrape.com) with **BeautifulSoup4** and **Requests**, -export data to .csv files and download cover images to *exports* folder. - _Tested on Windows 10, Python 3.9.5._ -### Post-course optimisation +### Objectives + +Scraping of [books.toscrape.com](http://books.toscrape.com) with **BeautifulSoup4** and **Requests**, +export data to .csv files and download cover images to the *"exports"* folder. + +Implementation of the ETL process: +- **E**xtract relevant and specific data from the source website; +- **T**ransform, filter and clean data; +- **L**oad data into searchable and retrievable files. + +## Post-course optimisation This project has been optimised after the end of the OpenClassrooms course. -To view the previous version, go to [this commit](https://github.com/hmignon/P2_mignon_helene/tree/163c5f5b2c730e7b308d01f31479702fb7c1e8e9). +To view the previously delivered version, please check [this commit](https://github.com/hmignon/P2_mignon_helene/tree/163c5f5b2c730e7b308d01f31479702fb7c1e8e9). Improvements made to this project include: -- Using OOP for the main scraper -- Parsing of command line arguments for options +- Using OOP for the main scraper - Optimising loops for faster execution time -- Json export +- Parsing of command line arguments for options: + - Json export option + - Ignore images option + - One-file export option +- Progress bars (tqdm) -# Setup +# Usage ### Clone the repository @@ -43,47 +53,50 @@ Improvements made to this project include: - `cd P2_mignon_helene` - `python -m venv env` -- Activate the environment `source env/bin/activate` (MacOS and Linux) or `env\Scripts\activate` (Windows) +- Activate the environment `source env/bin/activate` (macOS and Linux) or `env\Scripts\activate` (Windows) ### Install required packages - `pip install -r requirements.txt` -## Run the project +# Run the project + +To scrape the entirety of [books.toscrape.com](https://books.toscrape.com) to .csv files, +use the command `python main.py`. -In order to scrape the entirety of [books.toscrape.com](https://books.toscrape.com) to .csv files, -use the command `python main.py` +## Options -You can scrape one category via the argument `--category`. This argument takes either a **category name** or **full url**. +**Use `python main.py --help` to view all options.** + +- `--categories`: Scrape one or several categories. This argument takes **category names** and/or **full urls**. For example, the 2 following commands would yield the same results: ``` -python main.py --category travel -- OR - -python main.py --category https://books.toscrape.com/catalogue/category/books/travel_2/index.html +main.py --categories travel +main.py --categories http://books.toscrape.com/catalogue/category/books/travel_2/index.html ``` -A **json** export option has been added, as it is marginally faster than exporting to **csv**. -Both export types can be used in the same scraping process. +To scrape a selection of categories, add selected names and/or urls separated by one space. + +Note: selecting the same category several times (e.g. `python main.py --categories travel travel`) will only export data once. 
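Note: this deduplication falls out of how the requested values are matched against the scraper's own category list (see `BookScraper.start_scraper` further down in this diff) rather than being looped over directly. A minimal sketch of the idea, with sample data and simplified names, not the exact implementation:

```python
# Simplified illustration: --categories values are normalised (full urls lose their trailing
# "index.html" / "page-x.html" segment) and the master category list is filtered against them,
# so repeating a value cannot select the same category twice.
all_categories = [  # sample of the (name, trimmed url) tuples built by setup_categories()
    ("travel", "http://books.toscrape.com/catalogue/category/books/travel_2"),
    ("mystery", "http://books.toscrape.com/catalogue/category/books/mystery_3"),
]
requested = ["travel", "travel"]                      # e.g. python main.py --categories travel travel
requested = [c.rsplit("/", 1)[0] for c in requested]  # strip the suffix when a full url is given
selected = [(name, url) for name, url in all_categories
            if name in requested or url in requested]
print(selected)  # a single ('travel', ...) tuple - the category is only exported once
```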
``` -python main.py -j OR --json -python main.py -c OR --csv -python main.py -c -j +main.py --categories classics thriller +main.py --categories http://books.toscrape.com/catalogue/category/books/classics_6/index.html thriller ``` -Cover images download can be skipped via `--ignore-covers` +- `-c` or `--csv`: Export data to .csv files. +- `-j` or `--json`: Export data to .json files. -**Full list of optional arguments:** +Note: `-j` and `-c` can be used concurrently to export to both formats during the same scraping process. -
+- `--one-file` : Export all data to a single .csv/.json file. +- `--ignore-covers`: Skip cover images downloads. -### Using csv files +## Using .csv files -If you wish to open the exported csv files in any spreadsheet software (Microsoft Excel, LibreOffice/OpenOffice Calc, Google Sheets...), +If you wish to open the exported .csv files in any spreadsheet software (Microsoft Excel, LibreOffice/OpenOffice Calc, Google Sheets...), please make sure to select the following options: - UTF-8 encoding -- comma (,) as *separator* -- double-quote (") as *string delimiter* +- comma `,` as *separator* +- double quote `"` as *string delimiter* diff --git a/img/help.png b/img/help.png deleted file mode 100644 index b1dbf1c..0000000 Binary files a/img/help.png and /dev/null differ diff --git a/main.py b/main.py index 183550a..ce91c79 100644 --- a/main.py +++ b/main.py @@ -5,19 +5,21 @@ def timer(start): + """Calculate and print scraping process time.""" end_time = int(time.time()) - start - print(f"\n\nBooks exported in {end_time // 60} mins {end_time % 60} secs.") + print(f"\n\nAll done! Books exported in {end_time // 60} mins {end_time % 60} secs.") def main(): - parser = argparse.ArgumentParser( - description="BooksToScrape", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) + """Init arg parser, and start scraper with config vars.""" + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-c", "--csv", action="store_true", help="Export to csv files") parser.add_argument("-j", "--json", action="store_true", help="Export to json files") + parser.add_argument("--one-file", action="store_true", help="Export data to one csv file") parser.add_argument("--ignore-covers", action="store_true", help="Skip cover downloads") - parser.add_argument("--category", type=str, nargs="?", default=None, help="Scrape one category (name or full url)") + parser.add_argument("--categories", type=str, nargs="+", default=None, + help="Scrape specific categories (name or full url)") args = parser.parse_args() config = vars(args) if not config["json"] and not config["csv"]: @@ -25,6 +27,9 @@ def main(): start = int(time.time()) scraper = BookScraper() + print("-" * 30) + print(" Scraping Books.ToScrape.com") + print("-" * 30) scraper.start_scraper(config) timer(start) diff --git a/scraper.py b/scraper.py index 69839f3..a04fa8d 100644 --- a/scraper.py +++ b/scraper.py @@ -10,163 +10,161 @@ class BookScraper: def __init__(self): - self.root_url = "https://books.toscrape.com/" + self.root_url = "http://books.toscrape.com/" self.root_response = requests.get(self.root_url) self.root_soup = BeautifulSoup(self.root_response.text, 'html.parser') + self.books = {} + self.categories = self.setup_categories() self.exports_dir = "exports/" - self.csv_dir = "csv/" - self.json_dir = "json/" - self.covers_dir = "covers/" + self.csv_dir = f"{self.exports_dir}csv/" + self.json_dir = f"{self.exports_dir}json/" + self.covers_dir = f"{self.exports_dir}covers/" def start_scraper(self, config): """ - Create 'exports' directory and sub-directories - Test connection - Parse config options and start scraping process - Return error for failed connection + Create 'exports' directory, + Test connection, + Parse config options and start scraping process, + Return error for failed connection, @param config: dict of config args """ try: os.mkdir(self.exports_dir) - os.mkdir(f"{self.exports_dir}{self.json_dir}") - os.mkdir(f"{self.exports_dir}{self.csv_dir}") - 
os.mkdir(f"{self.exports_dir}{self.covers_dir}") except FileExistsError: pass if self.root_response.status_code == 200: - print("\n- connection ok -") - categories = self.setup_categories() - - if config["category"]: - if "html" in config["category"]: - if "index" not in config["category"]: - config["category"] = config["category"][:11] + "index.html" - categories = [(name, url) for name, url in categories if url == config["category"]] - else: - categories = [(name, url) for name, url in categories if name == config["category"]] - if not categories: - print("Invalid category, please retry.") + + if config["categories"] is not None: + cat_conf = config["categories"] + for i, category in enumerate(cat_conf): + # if param is url, remove "index.html" or "page-x.html" suffix + cat_conf[i] = category.rsplit("/", 1)[0] + self.categories = [(n, u) for n, u in self.categories if n in cat_conf or u in cat_conf] + if not self.categories: + print("Invalid categories, please retry.") exit() - book_urls = self.get_book_urls(categories) - self.book_data(categories, book_urls) + self.get_book_urls() if config["json"]: - self.export_json(categories) + self.export_json(config["one_file"]) if config["csv"]: - self.export_csv(categories) + self.export_csv(config["one_file"]) if not config["ignore_covers"]: - self.download_images(categories) + self.download_images() else: - print(self.root_response.raise_for_status()) - exit() + self.connection_error(self.root_response) def setup_categories(self): """ - Get all categories names and urls + Get all categories names and urls. @return: list of categories tuples (name, url) """ - categories = [] - categories_urls = [ - self.root_url + line["href"] for line in self.root_soup.select("ul > li > ul > li > a") - ] - category_names = self.root_soup.find("ul", class_="nav nav-list").find("ul").find_all("li") - - for i, url in enumerate(categories_urls): - categories.append((category_names[i].text.strip().lower(), url)) + if self.root_response.status_code == 200: + categories = [] + categories_urls = [ + self.root_url + line["href"].rsplit("/", 1)[0] + for line in self.root_soup.select("ul > li > ul > li > a") + ] + category_names = self.root_soup.find("ul", class_="nav nav-list").find("ul").find_all("li") + for i, url in enumerate(categories_urls): + categories.append((category_names[i].text.strip().lower(), url)) + return categories - return categories + else: + self.connection_error(self.root_response) - def get_book_urls(self, categories): + def get_book_urls(self): """ - For each category, get all books urls - Check for extra pages (more than 20 books) - Clean urls and return dict by category - @param categories: list of categories tuples (name, url) - @return: dict of all book urls lists by category + For each category, get all books urls, + Check for extra pages (more than 20 books), + Clean urls and extract data. 
""" - book_urls = {} - - for category in tqdm(categories, desc="Loading urls", ncols=80): + for category in tqdm(self.categories, desc="Extracting data", ncols=80): + self.books[category[0]] = [] response = requests.get(category[1]) - soup = BeautifulSoup(response.text, 'html.parser') - books_total = int(soup.select_one("form > strong").text) - if books_total > 20: - page_total = int(soup.find("li", {"class": "current"}).text.replace("Page 1 of", "")) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + books_total = int(soup.select_one("form > strong").text) + if books_total > 20: + page_total = int(soup.find("li", {"class": "current"}).text.replace("Page 1 of", "")) + else: + page_total = 1 + + book_urls = [] + for i in range(page_total): + if page_total == 1: + response = requests.get(category[1]) + else: + response = requests.get(category[1] + f"/page-{i + 1}.html") + soup = BeautifulSoup(response.text, "html.parser") + book_raw_urls = [line["href"] for line in soup.select("ol > li > article > h3 > a")] + for url in book_raw_urls: + book_urls.append(url.replace("../../../", f"{self.root_url}catalogue/")) + + for book_url in tqdm(book_urls, desc=category[0].title(), ncols=80, leave=False): + self.book_data(category[0], book_url) + else: - page_total = 1 + self.connection_error(response) + + def book_data(self, category, book_url): + """ + Scrape and clean book data and add data to books instance. + @param category: current book category name + @param book_url: current book url + """ + response = requests.get(book_url) + if response.status_code == 200: + soup = BeautifulSoup(response.content, 'html.parser') + product_info = soup.find_all('td') + description = soup.select_one("article > p").text.replace(' ...more', '') + if description.isspace(): + description = 'n/a' + img = soup.find("div", {"class": "item active"}).find("img") + img_url = img["src"].replace("../../", f"{self.root_url}") + book_data = { + 'product_page_url': book_url, + 'universal_product_code': product_info[0].text, + 'title': str(soup.find('h1').text), + 'price_including_tax': product_info[3].text, + 'price_excluding_tax': product_info[2].text, + 'number_available': re.sub(r"\D", "", product_info[5].text), + 'product_description': description, + 'category': category, + 'review_rating': + f"{self.review_rating(soup.select_one('.star-rating').attrs['class'][1])} star(s)", + 'image_url': img_url + } + self.books[category].append(book_data) - book_urls_clean = [] - for i in range(page_total): - if page_total == 1: - response = requests.get(category[1]) - else: - response = requests.get(category[1].replace("index", f"page-{i + 1}")) - soup = BeautifulSoup(response.text, "html.parser") - book_raw_urls = [line["href"] for line in soup.select("ol > li > article > h3 > a")] - for book in book_raw_urls: - book_urls_clean.append(book.replace("../../../", f"{self.root_url}catalogue/")) - - book_urls[category[0]] = book_urls_clean - - return book_urls - - def book_data(self, categories, book_urls): - """ - For each book in a category, scrape and clean book data - Add data to books instance - @param categories: list of categories tuples (name, url) - @param book_urls: dict of all book urls lists by category - """ - for i in trange(len(categories), desc="Extracting data", ncols=80): - self.books[categories[i][0]] = [] - - for url in book_urls[categories[i][0]]: - response = requests.get(url) - soup = BeautifulSoup(response.content, 'html.parser') - - product_info = soup.find_all('td') - 
description = soup.select_one("article > p").text.replace(' ...more', '') - if description.isspace(): - description = 'n/a' - img = soup.find("div", {"class": "item active"}).find("img") - img_url = img["src"].replace("../../", f"{self.root_url}") - book_data = { - 'product_page_url': url, - 'universal_product_code': product_info[0].text, - 'title': str(soup.find('h1').text), - 'price_including_tax': product_info[3].text, - 'price_excluding_tax': product_info[2].text, - 'number_available': re.sub(r"\D", "", product_info[5].text), - 'product_description': description, - 'category': categories[i][0], - 'review_rating': f"{self.review_rating(soup.select_one('.star-rating').attrs['class'][1])} star(s)", - 'image_url': img_url - } - self.books[categories[i][0]].append(book_data) + else: + self.connection_error(response) @staticmethod def review_rating(rating): """ - Compare star rating string to possible ratings list elements - Convert rating into integer + Compare star rating string to possible ratings list elements, + Convert rating into integer. @param rating: star rating string @return: rating type int """ - ratings = ['One', 'Two', 'Three', 'Four', 'Five'] - for i, mark in enumerate(ratings): + for i, mark in enumerate(['One', 'Two', 'Three', 'Four', 'Five']): if rating == mark: return i + 1 - def export_csv(self, categories): + def export_csv(self, one_file: bool): """ - Export book data to csv files in exports directory - @param categories: list of categories tuples (name, url) + Export book data to csv files in exports directory. + @param one_file: one file export if True """ + if not os.path.isdir(f"{self.csv_dir}"): + os.mkdir(f"{self.csv_dir}") + headers = [ 'product_page_url', 'universal_product_code', @@ -180,42 +178,70 @@ def export_csv(self, categories): 'image_url' ] - for category in tqdm(categories, desc="Exporting to csv", ncols=80): - csv_filename = category[0].lower().replace(' ', '_') + ".csv" - csv_fullpath = f"./{self.exports_dir}{self.csv_dir}{csv_filename}" + if one_file: + csv_fullpath = f"./{self.csv_dir}books.csv" with open(csv_fullpath, 'w', newline='', encoding='utf-8') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=headers) writer.writeheader() - writer.writerows(self.books[category[0]]) + for category in tqdm(self.categories, desc="Exporting to csv", ncols=80): + writer.writerows(self.books[category[0]]) - def export_json(self, categories): + else: + for category in tqdm(self.categories, desc="Exporting to csv", ncols=80): + csv_filename = category[0].lower().replace(' ', '_') + ".csv" + csv_fullpath = f"./{self.csv_dir}{csv_filename}" + with open(csv_fullpath, 'w', newline='', encoding='utf-8') as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=headers) + writer.writeheader() + writer.writerows(self.books[category[0]]) + + def export_json(self, one_file: bool): """ - Export book data to json files in exports directory - @param categories: list of categories tuples (name, url) + Export book data to json files in exports directory. 
+ @param one_file: one file export if True """ - for category in tqdm(categories, desc="Exporting to json", ncols=80): - json_filename = category[0].lower().replace(' ', '_') + ".json" - json_fullpath = f"./{self.exports_dir}{self.json_dir}{json_filename}" - json_data = json.dumps(self.books[category[0]], indent=4) - with open(json_fullpath, "w") as json_file: - json_file.write(json_data) + if not os.path.isdir(f"{self.json_dir}"): + os.mkdir(f"{self.json_dir}") + + if one_file: + for _ in trange(1, desc="Exporting to json", ncols=80): + json_fullpath = f"./{self.json_dir}books.json" + json_data = json.dumps(self.books, indent=2) + with open(json_fullpath, "w") as json_file: + json_file.write(json_data) - def download_images(self, categories): + else: + for category in tqdm(self.categories, desc="Exporting to json", ncols=80): + json_filename = category[0].lower().replace(' ', '_') + ".json" + json_fullpath = f"./{self.json_dir}{json_filename}" + json_data = json.dumps(self.books[category[0]], indent=4) + with open(json_fullpath, "w") as json_file: + json_file.write(json_data) + + def download_images(self): """ - Create category dirs within covers directory - Set name for image files as "upc + book title" - Download cover images - @param categories: list of categories tuples (name, url) + Create category dirs within covers directory, + Set names for image files as "upc + book title", + Download cover images. """ - for category in tqdm(categories, desc="Downloading cover images", ncols=80): - img_category_dir = f"{self.exports_dir}{self.covers_dir}{category[0]}/" + if not os.path.isdir(f"{self.covers_dir}"): + os.mkdir(f"{self.covers_dir}") + + for category in tqdm(self.categories, desc="Downloading cover images", ncols=80): + img_category_dir = f"{self.covers_dir}{category[0]}/" if not os.path.isdir(img_category_dir): os.mkdir(img_category_dir) for book in self.books[category[0]]: - clean_img_name = ''.join([x for x in book['title'][:100] if x.isalnum() or x in ' ']).replace(' ', '_') - img_name = f"{book['universal_product_code']}_{clean_img_name}.jpg" - img_data = requests.get(book['image_url']) - img_path = os.path.join(img_category_dir, img_name) - with open(img_path, 'wb') as img_file: - img_file.write(img_data.content) + image = requests.get(book["image_url"]) + img_name = f"{book['universal_product_code']}.jpg" + output_path = os.path.join(img_category_dir, img_name) + with open(output_path, "wb") as f: + f.write(image.content) + + @staticmethod + def connection_error(response): + """Display error message""" + print("Connection failed, please refer to details below:") + print(response.raise_for_status()) + exit()
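For reference, a minimal sketch (not part of the patch) of reading the single-file JSON export produced by `python main.py -j --one-file`; the `exports/json/books.json` path and the category-keyed structure follow the `export_json` method above:

```python
import json

# books.json maps each category name to its list of book dicts
# (the structure dumped by export_json(one_file=True)).
with open("exports/json/books.json") as f:
    books = json.load(f)

for category, items in books.items():
    print(f"{category}: {len(items)} book(s)")

# Each entry carries the keys built in BookScraper.book_data, e.g.
# books["travel"][0]["title"] or books["travel"][0]["price_including_tax"].
```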