v1.0, added basic code
Weever1337 committed Sep 17, 2024
1 parent 72b7c70 commit c3ed23a
Showing 3 changed files with 109 additions and 0 deletions.
35 changes: 35 additions & 0 deletions README.md
@@ -0,0 +1,35 @@
# ✨WebCopy

**WebCopy** is a Python project that lets you download the contents of a website, including its HTML, images, CSS, and other assets.

## Features
- Asynchronous downloading of website content
- Saves all assets (HTML, CSS, images, etc.) in the appropriate folder structure
- Output folder named after the website's domain (e.g., `example.com`; see the sketch below)
- Easy to use: just enter the URL of the website
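
The folder name is taken straight from the URL's host. A minimal sketch of that step, mirroring what `webcopy.py` does with `urllib.parse.urlparse` (the URL here is only illustrative):

```python
from urllib.parse import urlparse

url = "https://example.com/some/page"   # illustrative URL
folder_name = urlparse(url).netloc      # -> "example.com"
print(folder_name)
```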

## Installation
To get started with WebCopy, you'll need to install the required dependencies. You can do this by running the following command:

```bash
pip install -r requirements.txt
```

## Usage
After installing the necessary libraries, you can run the script and input any website URL to copy its contents.

```bash
python webcopy.py
```
The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the domain of the website.

## Example
```bash
Enter the URL of the website: https://example.com
```
This will create a folder named `example.com` in your current directory, containing the website's HTML and assets in their respective subfolders.
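
For illustration, copying a simple page might leave a layout like this (the file names here are only examples; the exact subfolders depend on the asset paths the site uses):

```
example.com/
├── index.html
├── css/
│   └── style.css
└── images/
    └── logo.png
```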

## Requirements
- Python 3.7+
- aiohttp
- beautifulsoup4
2 changes: 2 additions & 0 deletions requirements.txt
@@ -0,0 +1,2 @@
aiohttp==3.8.5
beautifulsoup4==4.12.2
72 changes: 72 additions & 0 deletions webcopy.py
@@ -0,0 +1,72 @@
import os
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


async def create_directory_for_resource(resource_url, folder):
    # Recreate the resource's URL path under the output folder and return
    # the local file path the asset should be written to.
    parsed_url = urlparse(resource_url)
    resource_path = parsed_url.path.lstrip("/")
    resource_folder = os.path.join(folder, os.path.dirname(resource_path))

    # exist_ok avoids an error when several assets share the same directory.
    os.makedirs(resource_folder, exist_ok=True)

    return os.path.join(resource_folder, os.path.basename(resource_path))


async def download_page(url, folder):
    # Fetch the page, save it as index.html in the output folder, then
    # download the assets it references (images, stylesheets, scripts).
    os.makedirs(folder, exist_ok=True)

async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching {url}: HTTP {response.status}")
return

page_name = os.path.join(folder, "index.html")
content = await response.text()
with open(page_name, "w", encoding="utf-8") as file:
file.write(content)

soup = BeautifulSoup(content, "html.parser")
await download_resources(soup, url, folder, session)

print(f"Page {url} downloaded successfully.")


async def download_resources(soup, base_url, folder, session):
    # Collect every <img src>, <link href> and <script src> reference,
    # resolve it against the page URL, and download them all concurrently.
    tasks = []
    tags = {"img": "src", "link": "href", "script": "src"}

for tag, attr in tags.items():
for resource in soup.find_all(tag):
resource_url = resource.get(attr)
if resource_url:
full_url = urljoin(base_url, resource_url)
tasks.append(save_resource(full_url, folder, session))

await asyncio.gather(*tasks)


async def save_resource(url, folder, session):
    # Fetch a single asset and write its raw bytes to the mirrored local path.
    try:
async with session.get(url) as response:
if response.status == 200:
resource_path = await create_directory_for_resource(url, folder)
content = await response.read()
with open(resource_path, "wb") as file:
file.write(content)
print(f"Resource {url} saved to {resource_path}.")
except Exception as e:
print(f"Error downloading resource {url}: {e}")


if __name__ == "__main__":
    url = input("Enter the URL of the website: ")
    # A full URL including the scheme is expected (e.g. https://example.com);
    # the domain part becomes the name of the output folder.
    parsed_url = urlparse(url)
    folder_name = parsed_url.netloc

asyncio.run(download_page(url, folder_name))
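
`download_page` can also be driven from another script instead of the interactive prompt. A minimal sketch, assuming `webcopy.py` is importable as a module named `webcopy` (the module name and target URL are only illustrative):

```python
import asyncio
from urllib.parse import urlparse

from webcopy import download_page  # assumes webcopy.py is on the import path

async def main():
    url = "https://example.com"        # illustrative target
    folder = urlparse(url).netloc      # same folder naming as the script
    await download_page(url, folder)

if __name__ == "__main__":
    asyncio.run(main())
```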
