diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2350e99
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# ✨ WebCopy
+
+**WebCopy** is a Python project that downloads the contents of a website, including its HTML, images, CSS, and other assets.
+
+## Features
+- Asynchronous downloading of website content
+- Saves all assets (HTML, CSS, images, etc.) in the appropriate folder structure
+- Names the output folder after the website's domain (e.g., `example.com`)
+- Easy to use: just enter the URL of the website
+
+## Installation
+To get started with WebCopy, install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+After installing the dependencies, run the script and enter the URL of any website to copy its contents.
+
+```bash
+python webcopy.py
+```
+The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the website's domain.
+
+## Example
+```bash
+Enter the URL of the website: https://example.com
+```
+This creates a folder named `example.com` in your current directory, containing the website's HTML and assets in their respective subfolders.
+
+## Requirements
+- Python 3.7+
+- aiohttp
+- beautifulsoup4
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..53bac29
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+aiohttp==3.8.5
+beautifulsoup4==4.12.2
\ No newline at end of file
diff --git a/webcopy.py b/webcopy.py
new file mode 100644
index 0000000..c43f74f
--- /dev/null
+++ b/webcopy.py
@@ -0,0 +1,81 @@
+import os
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+
+
+def create_directory_for_resource(resource_url, folder):
+    # Mirror the URL path under the target folder and return the local file path.
+    parsed_url = urlparse(resource_url)
+    resource_path = parsed_url.path.lstrip("/")
+    resource_folder = os.path.join(folder, os.path.dirname(resource_path))
+    os.makedirs(resource_folder, exist_ok=True)
+
+    # Fall back to a default name when the URL path ends in "/".
+    filename = os.path.basename(resource_path) or "index"
+    return os.path.join(resource_folder, filename)
+
+
+async def download_page(url, folder):
+    # Save the page itself as index.html, then fetch every referenced asset.
+    os.makedirs(folder, exist_ok=True)
+
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url) as response:
+            if response.status != 200:
+                print(f"Error fetching {url} (HTTP {response.status})")
+                return
+            content = await response.text()
+
+        page_name = os.path.join(folder, "index.html")
+        with open(page_name, "w", encoding="utf-8") as file:
+            file.write(content)
+
+        soup = BeautifulSoup(content, "html.parser")
+        await download_resources(soup, url, folder, session)
+
+    print(f"Page {url} downloaded successfully.")
+
+
+async def download_resources(soup, base_url, folder, session):
+    # Collect <img src>, <link href>, and <script src> references, then fetch them concurrently.
+    tasks = []
+    tags = {"img": "src", "link": "href", "script": "src"}
+
+    for tag, attr in tags.items():
+        for resource in soup.find_all(tag):
+            resource_url = resource.get(attr)
+            if resource_url:
+                # Resolve relative references against the page URL.
+                full_url = urljoin(base_url, resource_url)
+                tasks.append(save_resource(full_url, folder, session))
+
+    await asyncio.gather(*tasks)
+
+
+async def save_resource(url, folder, session):
+    try:
+        async with session.get(url) as response:
+            if response.status != 200:
+                print(f"Error fetching resource {url} (HTTP {response.status})")
+                return
+            resource_path = create_directory_for_resource(url, folder)
+            content = await response.read()
+            with open(resource_path, "wb") as file:
+                file.write(content)
+            print(f"Resource {url} saved to {resource_path}.")
+    except Exception as e:
print(f"Error downloading resource {url}: {e}") + + +if __name__ == "__main__": + url = input("Enter the URL of the website: ") + parsed_url = urlparse(url) + folder_name = parsed_url.netloc + + asyncio.run(download_page(url, folder_name))