Commit c3ed23a (1 parent: 72b7c70)
Showing 3 changed files with 109 additions and 0 deletions.
README.md
@@ -0,0 +1,35 @@
# ✨WebCopy

**WebCopy** is a Python project that allows you to download the contents of any website, including HTML, images, CSS, and other assets.

## Features
- Asynchronous downloading of website content (see the sketch after this list)
- Saves all assets (HTML, CSS, images, etc.) in the appropriate folder structure
- Output folder named after the website's domain (e.g., `example.com`)
- Easy to use: just enter the URL of the website
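
To illustrate the asynchronous approach, here is a minimal sketch of the aiohttp + asyncio pattern the project is built on (not WebCopy's actual code; the URLs are placeholders):

```python
import asyncio
import aiohttp

async def fetch(session, url):
    # Each coroutine downloads one URL through the shared session.
    async with session.get(url) as response:
        return await response.read()

async def main():
    # Placeholder URLs for illustration only.
    urls = ["https://example.com/", "https://example.com/style.css"]
    async with aiohttp.ClientSession() as session:
        # gather() runs all downloads concurrently instead of one by one.
        bodies = await asyncio.gather(*(fetch(session, u) for u in urls))
        print([len(b) for b in bodies])

asyncio.run(main())
```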

## Installation
To get started with WebCopy, you'll need to install the required dependencies. You can do this by running the following command:

```bash
pip install -r requirements.txt
```

## Usage
After installing the necessary libraries, you can run the script and input any website URL to copy its contents.

```bash
python webcopy.py
```

The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the domain of the website.

## Example
```bash
Enter the URL of the website: https://example.com
```

This will create a folder named `example.com` in your current directory, containing the website's HTML and assets in their respective subfolders.
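
For instance, if the page references `css/style.css` and `images/logo.png` (hypothetical assets for illustration), the resulting layout would look like:

```
example.com/
├── index.html
├── css/
│   └── style.css
└── images/
    └── logo.png
```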

## Requirements
- Python 3.7+
- aiohttp
- beautifulsoup4
requirements.txt
@@ -0,0 +1,2 @@
aiohttp==3.8.5
beautifulsoup4==4.12.2
webcopy.py
@@ -0,0 +1,72 @@
import os
import asyncio
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup


def create_directory_for_resource(resource_url, folder):
    # Mirror the resource's URL path inside the output folder, e.g.
    # https://example.com/css/style.css -> <folder>/css/style.css
    parsed_url = urlparse(resource_url)
    resource_path = parsed_url.path.lstrip("/")

    # Fall back to a generic file name when the URL path ends in a slash
    # (otherwise os.path.basename() would be empty and open() would fail).
    if not os.path.basename(resource_path):
        resource_path = os.path.join(resource_path, "index.html")

    resource_folder = os.path.join(folder, os.path.dirname(resource_path))
    os.makedirs(resource_folder, exist_ok=True)

    return os.path.join(resource_folder, os.path.basename(resource_path))


async def download_page(url, folder):
    os.makedirs(folder, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching {url}: HTTP {response.status}")
                return
            content = await response.text()

        # Save the page itself as index.html in the output folder.
        page_name = os.path.join(folder, "index.html")
        with open(page_name, "w", encoding="utf-8") as file:
            file.write(content)

        # Parse the HTML and download every asset it references.
        soup = BeautifulSoup(content, "html.parser")
        await download_resources(soup, url, folder, session)

    print(f"Page {url} downloaded successfully.")


async def download_resources(soup, base_url, folder, session):
    # Queue one download task per referenced asset.
    tasks = []
    tags = {"img": "src", "link": "href", "script": "src"}

    for tag, attr in tags.items():
        for resource in soup.find_all(tag):
            resource_url = resource.get(attr)
            if resource_url:
                # Resolve relative references against the page URL.
                full_url = urljoin(base_url, resource_url)
                tasks.append(save_resource(full_url, folder, session))

    # Download all assets concurrently.
    await asyncio.gather(*tasks)


async def save_resource(url, folder, session):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                resource_path = create_directory_for_resource(url, folder)
                content = await response.read()
                with open(resource_path, "wb") as file:
                    file.write(content)
                print(f"Resource {url} saved to {resource_path}.")
    except Exception as e:
        print(f"Error downloading resource {url}: {e}")


if __name__ == "__main__":
    url = input("Enter the URL of the website: ")
    # Name the output folder after the site's domain, e.g. example.com.
    parsed_url = urlparse(url)
    folder_name = parsed_url.netloc

    asyncio.run(download_page(url, folder_name))