diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2a84d27
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+env/
+.idea
+exports/
+notes.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index 4bb736c..f41449a 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,12 @@
 # P2_mignon_helene
+
 **Livrable du Projet 2 du parcours D-A Python d'OpenClassrooms :** Scraping de books.toscrape.com avec BeautifulSoup4 ; exportation des infos dans fichiers .csv et des images de couverture dans dossier 'exports'.
 
 
-_Notes : Ce programme invite l'utilisateur à copier l'url du site (https://books.toscrape.com/index.html) ou de la catégorie qu'il souhaite exporter. Testé sous Windows 10, Python 3.9.5._
+### Version optimisée post-formation
+Application de la POO pour le scraper. Optimisation des boucles et réduction du temps d'exécution.
+
+_Testé sous Windows 10, Python 3.9.5._
 
 ----------------------------------------------
 ## Windows :
diff --git a/books_to_scrape/book_info.py b/books_to_scrape/book_info.py
deleted file mode 100644
index eb8fd12..0000000
--- a/books_to_scrape/book_info.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
------------------------------------------------------
-    EXTRACT BOOK DATA
------------------------------------------------------
-"""
-import re
-
-import requests
-from bs4 import BeautifulSoup
-
-from books_to_scrape.export_data import csv_file_append, download_images
-
-
-def get_book_info(book_url, cat_name, csv_filename):
-    """
-    Get book information and add it to a list
-
-    @param book_url: book url string
-    @param cat_name: category name string
-    @param csv_filename: category name string extracted from category url + .csv extension
-    """
-    response = requests.get(book_url)
-    soup = BeautifulSoup(response.content, 'html.parser')
-
-    product_info = soup.find_all('td')
-    upc = product_info[0].text
-    pit = product_info[3].text
-    pet = product_info[2].text
-
-    available = product_info[5].text
-    num_available = re.sub("[^0-9]", "", available)
-
-    title = str(soup.find('h1').text)
-
-    description = soup.select_one("article > p").text.replace(' ...more', '')
-    if description.isspace():
-        description = 'n/a'
-
-    review_rating = get_review_rating(soup.select_one('.star-rating').attrs['class'][1])
-    img = soup.find("div", {"class": "item active"}).find("img")
-    img_url = img["src"].replace("../../", "https://books.toscrape.com/")
-
-    info = [book_url, upc, title, pit, pet, num_available,
-            description, cat_name, str(review_rating) + " star(s)", img_url]
-    csv_file_append(csv_filename, info)
-    download_images(title, upc, img_url, cat_name)
-
-
-def get_review_rating(rating):
-    """
-    Compare star rating string to possible ratings list elements
-    Convert rating into integer
-
-    @param rating: star rating string
-    @return: rating type int
-    """
-    ratings = ['One', 'Two', 'Three', 'Four', 'Five']
-    for i, mark in enumerate(ratings):
-        if rating == mark:
-            return i + 1
diff --git a/books_to_scrape/category_info.py b/books_to_scrape/category_info.py
deleted file mode 100644
index 8cb0765..0000000
--- a/books_to_scrape/category_info.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
------------------------------------------------------
-    EXTRACT CATEGORY INFO
------------------------------------------------------
-"""
-
-import re
-
-import requests
-from bs4 import BeautifulSoup
-
-from books_to_scrape.book_info import get_book_info
-from books_to_scrape.export_data import create_csv_file
-
-
-def get_cat_name(cat_url):
-    """
-    Get category name from category url string
-
-    @param cat_url: category url string
-    @return: category name string
-    """
-    cat_name = cat_url.replace('https://books.toscrape.com/catalogue/category/books/', '')
-    cat_name = cat_name.replace('/index.html', '').replace('_', '').replace('-', ' ')
-    cat_name = re.sub(r'[0-9]+', '', cat_name)
-
-    print("\nExporting " + cat_name.title() + "\n")
-    return cat_name
-
-
-def get_cat_pages_urls(cat_url):
-    """
-    Get total number of books in current category
-    if more than 20 books: get total amount of pages
-    else: total = 1 page
-    Create .csv file
-    For each page, get book urls
-
-    @param cat_url: category url string
-    """
-    cat_name = get_cat_name(cat_url)
-    response = requests.get(cat_url)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    books_total = int(soup.select_one("form > strong").text)
-    if books_total > 20:
-        page_total = int(soup.find("li", {"class": "current"}).text.replace("Page 1 of", ""))
-    else:
-        page_total = 1
-
-    csv_filename = cat_name.lower().replace(' ', '_') + ".csv"
-    create_csv_file(csv_filename)
-
-    page_url = cat_url
-    current_cat_pages = [page_url]
-    for page in range(page_total):
-        if page == 0:
-            book_url_list = get_book_urls(current_cat_pages[0])
-            for k in range(len(book_url_list)):
-                get_book_info(book_url_list[k], cat_name, csv_filename)
-        else:
-            current_cat_pages.append(page_url.replace("index", "page-" + str(page + 1)))
-            book_url_list = get_book_urls(current_cat_pages[page])
-            for k in range(len(book_url_list)):
-                get_book_info(book_url_list[k], cat_name, csv_filename)
-
-    print(str(books_total) + " book(s) exported\n\n")
-    print('-----------------------------------------------')
-
-
-def get_book_urls(cat_page):
-    """
-    For each page, add clean book url to a list
-
-    @param cat_page: current category page url string
-    @return: list of book urls in page
-    """
-    response = requests.get(cat_page)
-    soup = BeautifulSoup(response.text, 'html.parser')
-    book_url_list = []
-    book_url = [line["href"] for line in soup.select("ol > li > article > h3 > a")]
-    for book in range(len(book_url)):
-        book_url_clean = book_url[book].replace("../../../", "https://books.toscrape.com/catalogue/")
-        book_url_list.append(book_url_clean)
-
-    return book_url_list
diff --git a/books_to_scrape/export_data.py b/books_to_scrape/export_data.py
deleted file mode 100644
index 2d651b9..0000000
--- a/books_to_scrape/export_data.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""
------------------------------------------------------
-    EXPORTS
------------------------------------------------------
-"""
-
-import os
-import csv
-
-import requests
-
-
-def create_csv_file(csv_filename):
-    """
-    Create 'exports' folder, create .csv file within 'exports' folder
-    Write header row in .csv file
-
-    @param csv_filename: category name extracted from category url + .csv extension
-    """
-    directory = 'exports'
-    if not os.path.isdir(directory):
-        os.mkdir(directory)
-    with open('./exports/' + csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
-        book_csv = csv.writer(csv_file, delimiter=';')
-        book_csv.writerow([
-            'product_page_url',
-            'universal_product_code',
-            'title',
-            'price_including_tax',
-            'price_excluding_tax',
-            'number_available',
-            'product_description',
-            'category',
-            'review_rating',
-            'image_url'
-        ])
-
-
-def csv_file_append(csv_filename, info):
-    """
-    Append extracted book info list to previously created .csv file
-
-    @param csv_filename: category name string extracted from category url + .csv extension
-    @param info: list of book information
-    """
-    with open('./exports/' + csv_filename, 'a+', newline='', encoding='utf-8') as csv_file:
-        book_csv = csv.writer(csv_file, delimiter=';')
-        book_csv.writerow(info)
-
-
-def download_images(title, upc, img_url, cat_name):
-    """
-    Create 'cover_images' folder within 'exports'
-    Create folder with current category name in 'cover_images'
-    Shorten and clean image file name of invalid characters
-    Download and save image as .jpg file
-
-    @param title: current book title string
-    @param upc: universal product code string
-    @param img_url: current book cover image url string
-    @param cat_name: category name string
-    """
-    img_directory = 'exports/cover_images/'
-    img_category_dir = img_directory + cat_name + '/'
-    img_name_clean = ''.join([x for x in title[:100] if x.isalnum() or x in ' ']).replace(' ', '_') + '.jpg'
-    img_filename = upc + "_" + img_name_clean
-    img_data = requests.get(img_url).content
-
-    if not os.path.isdir(img_directory):
-        os.mkdir(img_directory)
-    img_path = os.path.join(img_category_dir, img_filename)
-    if not os.path.isdir(img_category_dir):
-        os.mkdir(img_category_dir)
-
-    file = open(img_path, "wb")
-    file.write(img_data)
diff --git a/main.py b/main.py
index 54adcc7..a6f449e 100644
--- a/main.py
+++ b/main.py
@@ -1,84 +1,6 @@
-# -*- coding: utf-8 -*-
-
-"""
-===================================================================
-    SCRAPING BOOKS.TOSCRAPE.COM
-
-This program will scrape books information from books.toscrape.com
-with BeautifulSoup4, export the data to .csv files and download the
-cover images to an 'exports' folder
-===================================================================
-
-"""
-
 import time
 
-import requests
-from bs4 import BeautifulSoup
-
-from books_to_scrape.category_info import get_cat_pages_urls
-
-
-def main():
-    """
-    Prompt the user to choose whether they want to scrape
-    the entire website or only one category
-    Get all category urls into a list
-    If 1 category, compare user input to list
-    """
-    print("\n\n-----------------------------")
-    print("\n Scraping books.toscrape.com\n")
-    print("-----------------------------\n\n")
-    time.sleep(1)
-    main_url = 'https://books.toscrape.com/'
-    response = requests.get(main_url)
-
-    if response.status_code == 200:
-        print("\n- connection ok -")
-        soup = BeautifulSoup(response.text, 'html.parser')
-        cat_url_list = [main_url + line["href"] for line in soup.select("ul > li > ul > li > a")]
-
-        url = input('\n\nPaste the url you would like to scrape : ')
-        start_time = int(time.time())
-
-        if url.replace('index.html', '') == main_url:
-            print("\nExporting all categories...\n")
-            for i in range(len(cat_url_list)):
-                get_cat_pages_urls(cat_url_list[i])
-            timer(start_time)
-            time.sleep(1)
-            print('\n------END------')
-
-        elif url in cat_url_list:
-            index = cat_url_list.index(url)
-            cat_url = cat_url_list[index]
-            get_cat_pages_urls(cat_url)
-            timer(start_time)
-            time.sleep(1)
-            print('\n------END------')
-
-        else:
-            print('\n\nPlease enter a valid url (full website or one category).\n\n')
-            time.sleep(2)
-            main()
-
-    else:
-        response.raise_for_status()
-        print("\n- connection error -")
-        print("Please check connection status.")
-        time.sleep(1)
-        retry = input("Retry? (y/n) :").lower().strip()
-        while retry != "y" != "n":
-            print("input error")
-            retry = input("Retry? (y/n) :").lower().strip()
(y/n) :").lower().strip() - if retry == "y": - print("Restarting...") - time.sleep(2) - main() - elif retry == "n": - print('Closing application...') - time.sleep(2) - exit() +from scraper import BookScraper def timer(start_time): @@ -90,4 +12,9 @@ def timer(start_time): if __name__ == "__main__": - main() + start_time = int(time.time()) + + scraper = BookScraper() + scraper.start_scraper() + + timer(start_time) diff --git a/requirements.txt b/requirements.txt index bbdb237..392e8c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ chardet==4.0.0 idna==2.10 requests==2.25.1 soupsieve==2.2.1 +tqdm~=4.64.0 urllib3==1.26.5 \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..56f0ee2 --- /dev/null +++ b/scraper.py @@ -0,0 +1,180 @@ +import csv +import os +import re + +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm + + +class BookScraper: + def __init__(self): + self.main_url = "https://books.toscrape.com/" + self.response = requests.get(self.main_url) + self.soup = BeautifulSoup(self.response.text, 'html.parser') + self.exports_dir = "exports/" + self.csv_dir = "csv/" + self.covers_dir = "covers/" + + def start_scraper(self): + """ + Create exports directory and sub-directories + Test connection + Return error for failed connection + """ + try: + os.mkdir(self.exports_dir) + os.mkdir(f"{self.exports_dir}{self.csv_dir}") + os.mkdir(f"{self.exports_dir}{self.covers_dir}") + except FileExistsError: + pass + + if self.response.status_code == 200: + print("\n- connection ok -") + self.get_urls() + + else: + print(self.response.raise_for_status()) + exit() + + def get_urls(self): + """ + Get all categpries urls + Get all urls from each category + """ + urls = {} + book_info = {} + categories = [] + categories_urls = [ + self.main_url + line["href"] for line in self.soup.select("ul > li > ul > li > a") + ] + + print("\nGetting urls...") + for url in tqdm(categories_urls): + category_name = url.replace('https://books.toscrape.com/catalogue/category/books/', '') + category_name = category_name.replace('/index.html', '').replace('_', '').replace('-', ' ') + category_name = re.sub(r'\d+', '', category_name) + categories.append(category_name) + book_info[f"{category_name}"] = [] + + response = requests.get(url) + soup = BeautifulSoup(response.text, 'html.parser') + books_total = int(soup.select_one("form > strong").text) + if books_total > 20: + page_total = int(soup.find("li", {"class": "current"}).text.replace("Page 1 of", "")) + else: + page_total = 1 + + book_urls_clean = [] + for i in range(page_total): + if page_total == 1: + response = requests.get(url) + else: + response = requests.get(url.replace('index', f"page-{i + 1}")) + soup = BeautifulSoup(response.text, 'html.parser') + book_urls = [line["href"] for line in soup.select("ol > li > article > h3 > a")] + for book in book_urls: + book_urls_clean.append(book.replace("../../../", "https://books.toscrape.com/catalogue/")) + + urls[f"{category_name}"] = book_urls_clean + + self.book_data(urls, book_info, categories) + + def book_data(self, urls, book_info, categories): + """ + Scrape book info for each category + @param urls: dict of all books urls + @param book_info: empty dict of books data + @param categories: list of all categories names + """ + print("\nDownloading book info...") + for i in tqdm(range(len(urls))): + for url in urls[f"{categories[i]}"]: + response = requests.get(url) + soup = BeautifulSoup(response.content, 'html.parser') + + product_info 
+                description = soup.select_one("article > p").text.replace(' ...more', '')
+                if description.isspace():
+                    description = 'n/a'
+                img = soup.find("div", {"class": "item active"}).find("img")
+                img_url = img["src"].replace("../../", f"{self.main_url}")
+                book_data = {
+                    'product_page_url': url,
+                    'universal_product_code': product_info[0].text,
+                    'title': str(soup.find('h1').text),
+                    'price_including_tax': product_info[3].text,
+                    'price_excluding_tax': product_info[2].text,
+                    'number_available': re.sub(r"\D", "", product_info[5].text),
+                    'product_description': description,
+                    'category': categories[i],
+                    'review_rating': f"{self.review_rating(soup.select_one('.star-rating').attrs['class'][1])} star(s)",
+                    'image_url': img_url
+                }
+                book_info[f"{categories[i]}"].append(book_data)
+
+        self.export_csv(categories, book_info)
+        self.download_images(book_info, categories)
+
+    @staticmethod
+    def review_rating(rating):
+        """
+        Compare star rating string to possible ratings list elements
+        Convert rating into integer
+
+        @param rating: star rating string
+        @return: rating type int
+        """
+        ratings = ['One', 'Two', 'Three', 'Four', 'Five']
+        for i, mark in enumerate(ratings):
+            if rating == mark:
+                return i + 1
+
+    def export_csv(self, categories, book_info):
+        """
+        Export book data to csv files in exports directory
+        @param categories: list of category names
+        @param book_info: dict of all book info
+        """
+        headers = [
+            'product_page_url',
+            'universal_product_code',
+            'title',
+            'price_including_tax',
+            'price_excluding_tax',
+            'number_available',
+            'product_description',
+            'category',
+            'review_rating',
+            'image_url'
+        ]
+
+        print("\nExporting data...")
+        for category in tqdm(categories):
+            csv_filename = category.lower().replace(' ', '_') + ".csv"
+            csv_fullpath = f"./{self.exports_dir}{self.csv_dir}{csv_filename}"
+            with open(csv_fullpath, 'w', newline='', encoding='utf-8') as csv_file:
+                writer = csv.DictWriter(csv_file, fieldnames=headers)
+                writer.writeheader()
+                writer.writerows(book_info[f"{category}"])
+
+    def download_images(self, book_info, categories):
+        """
+        Download cover images to exports directory
+        @param book_info: dict of all book info
+        @param categories: list of category names
+        """
+        print("\nDownloading cover images...")
+        for category in tqdm(categories):
+            img_category_dir = f"{self.exports_dir}{self.covers_dir}{category}/"
+            if not os.path.isdir(img_category_dir):
+                os.mkdir(img_category_dir)
+
+            for book in book_info[f'{category}']:
+                clean_img_name = ''.join([x for x in book['title'][:100] if x.isalnum() or x in ' ']).replace(' ', '_')
+                img_name = f"{book['universal_product_code']}_{clean_img_name}.jpg"
+                img_data = requests.get(book['image_url'])
+
+                img_path = os.path.join(img_category_dir, img_name)
+                with open(img_path, 'wb') as handler:
+                    handler.write(img_data.content)
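For reference, a minimal usage sketch of the refactored entry point, mirroring the updated `main.py` in this patch; the final `print` stands in for the repo's `timer()` helper, whose body is unchanged and therefore not shown in the diff:

```python
import time

from scraper import BookScraper

if __name__ == "__main__":
    start_time = int(time.time())

    # BookScraper() fetches the site index on construction;
    # start_scraper() creates exports/csv/ and exports/covers/, walks every
    # category, then exports the csv files and cover images.
    scraper = BookScraper()
    scraper.start_scraper()

    # Stand-in for the existing timer() helper from main.py.
    print(f"Elapsed: {int(time.time()) - start_time} s")
```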