post-course optimisation
hmignon committed Jun 8, 2022
1 parent 163c5f5 commit 5cdbfc0
Showing 8 changed files with 198 additions and 308 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
__pycache__/
env/
.idea
exports/
notes.txt
6 changes: 5 additions & 1 deletion README.md
@@ -1,8 +1,12 @@
# P2_mignon_helene

**Deliverable for Project 2 of the OpenClassrooms Python Developer path:**
Scrapes books.toscrape.com with BeautifulSoup4; exports the book data to .csv files and the cover images to an 'exports' folder.

_Note: the program prompts the user to paste the url of the site (https://books.toscrape.com/index.html) or of the category they wish to export. Tested on Windows 10, Python 3.9.5._
### Post-course optimised version
OOP applied to the scraper: loops optimised and execution time reduced.

_Tested on Windows 10, Python 3.9.5._

----------------------------------------------
## Windows :
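The README describes the refactor as "OOP applied to the scraper" without showing the class itself. A minimal sketch of what such a `BookScraper` might look like (the class shape and method names are assumptions, not the author's actual code); the CSS selector is the one the pre-refactor `main()` used to collect category links from the sidebar:

```python
from bs4 import BeautifulSoup

# Static nav snippet mimicking the books.toscrape.com sidebar (sample data)
SAMPLE_NAV = """
<ul>
  <li><a href="catalogue/category/books_1/index.html">Books</a>
    <ul>
      <li><a href="catalogue/category/books/travel_2/index.html">Travel</a></li>
      <li><a href="catalogue/category/books/mystery_3/index.html">Mystery</a></li>
    </ul>
  </li>
</ul>
"""


class BookScraper:
    """Hypothetical outline of the OOP refactor (names are assumptions)."""

    def __init__(self, base_url="https://books.toscrape.com/"):
        self.base_url = base_url

    def category_urls(self, html):
        # Same selector the pre-refactor main() applied to the sidebar nav
        soup = BeautifulSoup(html, "html.parser")
        return [self.base_url + a["href"]
                for a in soup.select("ul > li > ul > li > a")]


scraper = BookScraper()
print(scraper.category_urls(SAMPLE_NAV))
```

Keeping the base url as instance state is what lets the later `scraper.start_scraper()` call in `main.py` run without any arguments.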
62 changes: 0 additions & 62 deletions books_to_scrape/book_info.py

This file was deleted.

87 changes: 0 additions & 87 deletions books_to_scrape/category_info.py

This file was deleted.

78 changes: 0 additions & 78 deletions books_to_scrape/export_data.py

This file was deleted.

87 changes: 7 additions & 80 deletions main.py
@@ -1,84 +1,6 @@
# -*- coding: utf-8 -*-

"""
===================================================================
SCRAPING BOOKS.TOSCRAPE.COM
This program will scrape books information from books.toscrape.com
with BeautifulSoup4, export the data to .csv files and download the
cover images to an 'exports' folder
===================================================================
"""

import time

import requests
from bs4 import BeautifulSoup

from books_to_scrape.category_info import get_cat_pages_urls


def main():
    """
    Prompt the user to choose whether they want to scrape
    the entire website or only one category.
    Get all category urls into a list;
    if one category, compare the user input to the list.
    """
    print("\n\n-----------------------------")
    print("\n Scraping books.toscrape.com\n")
    print("-----------------------------\n\n")
    time.sleep(1)
    main_url = 'https://books.toscrape.com/'
    response = requests.get(main_url)

    if response.status_code == 200:
        print("\n- connection ok -")
        soup = BeautifulSoup(response.text, 'html.parser')
        cat_url_list = [main_url + line["href"] for line in soup.select("ul > li > ul > li > a")]

        url = input('\n\nPaste the url you would like to scrape : ')
        start_time = int(time.time())

        if url.replace('index.html', '') == main_url:
            print("\nExporting all categories...\n")
            for cat_url in cat_url_list:
                get_cat_pages_urls(cat_url)
            timer(start_time)
            time.sleep(1)
            print('\n------END------')

        elif url in cat_url_list:
            get_cat_pages_urls(url)
            timer(start_time)
            time.sleep(1)
            print('\n------END------')

        else:
            print('\n\nPlease enter a valid url (full website or one category).\n\n')
            time.sleep(2)
            main()

    else:
        print("\n- connection error -")
        print("Please check connection status.")
        time.sleep(1)
        retry = input("Retry? (y/n) :").lower().strip()
        while retry not in ("y", "n"):
            print("input error")
            retry = input("Retry? (y/n) :").lower().strip()
        if retry == "y":
            print("Restarting...")
            time.sleep(2)
            main()
        else:
            print('Closing application...')
            time.sleep(2)
            exit()
from scraper import BookScraper


def timer(start_time):
@@ -90,4 +12,9 @@ def timer(start_time):


if __name__ == "__main__":
    main()
    start_time = int(time.time())

    scraper = BookScraper()
    scraper.start_scraper()

    timer(start_time)
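The body of `timer()` is collapsed in the diff above, so only its signature is visible. A plausible, purely illustrative implementation (the actual code may differ) that turns the recorded `start_time` into a readable duration:

```python
import time


def format_elapsed(elapsed):
    """Format a duration in seconds as 'X min Y s' (illustrative helper)."""
    mins, secs = divmod(int(elapsed), 60)
    return f"{mins} min {secs} s"


def timer(start_time):
    """Print the total execution time since start_time.

    Sketch only; not the author's actual implementation.
    """
    print(f"\nExecution time: {format_elapsed(time.time() - start_time)}")
```

Splitting the formatting out of `timer()` keeps the seconds-to-minutes arithmetic in a pure function that is trivial to test.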
1 change: 1 addition & 0 deletions requirements.txt
@@ -5,4 +5,5 @@ chardet==4.0.0
idna==2.10
requests==2.25.1
soupsieve==2.2.1
tqdm~=4.64.0
urllib3==1.26.5