Commit c3ed23a (1 parent: 72b7c70)
Showing 3 changed files with 109 additions and 0 deletions.
README.md
@@ -0,0 +1,35 @@
# ✨WebCopy

**WebCopy** is a Python project that allows you to download the contents of any website, including HTML, images, CSS, and other assets.

## Features
- Asynchronous downloading of website content (see the sketch after this list)
- Saves all assets (HTML, CSS, images, etc.) in the appropriate folder structure
- Output folder named after the website's domain (e.g., `example.com`)
- Easy to use: just enter the URL of the website
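
To illustrate the asynchronous approach, here is a minimal sketch of the aiohttp + asyncio pattern the project is built on (not WebCopy's actual code; the URLs are placeholders):

```python
import asyncio
import aiohttp

async def fetch(session, url):
    # Each coroutine downloads one URL through the shared session.
    async with session.get(url) as response:
        return await response.read()

async def main():
    # Placeholder URLs for illustration only.
    urls = ["https://example.com/", "https://example.com/style.css"]
    async with aiohttp.ClientSession() as session:
        # gather() runs all downloads concurrently instead of one by one.
        bodies = await asyncio.gather(*(fetch(session, u) for u in urls))
        print([len(b) for b in bodies])

asyncio.run(main())
```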

## Installation
To get started with WebCopy, you'll need to install the required dependencies. You can do this by running the following command:

```bash
pip install -r requirements.txt
```

## Usage
After installing the necessary libraries, you can run the script and input any website URL to copy its contents.

```bash
python webcopy.py
```

The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the domain of the website.

## Example
```bash
Enter the URL of the website: https://example.com
```

This will create a folder named `example.com` in your current directory, containing the website's HTML and assets in their respective subfolders.
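
For instance, if the page references `css/style.css` and `images/logo.png` (hypothetical assets for illustration), the resulting layout would look like:

```
example.com/
├── index.html
├── css/
│   └── style.css
└── images/
    └── logo.png
```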

## Requirements
- Python 3.7+
- aiohttp
- beautifulsoup4
requirements.txt
@@ -0,0 +1,2 @@
aiohttp==3.8.5
beautifulsoup4==4.12.2
webcopy.py
@@ -0,0 +1,72 @@
import os
import asyncio
from urllib.parse import urljoin, urlparse

import aiohttp
from bs4 import BeautifulSoup


def create_directory_for_resource(resource_url, folder):
    # Mirror the resource's URL path inside the output folder, e.g.
    # https://example.com/css/style.css -> <folder>/css/style.css
    parsed_url = urlparse(resource_url)
    resource_path = parsed_url.path.lstrip("/")

    # Fall back to a generic file name when the URL path ends in a slash
    # (otherwise os.path.basename() would be empty and open() would fail).
    if not os.path.basename(resource_path):
        resource_path = os.path.join(resource_path, "index.html")

    resource_folder = os.path.join(folder, os.path.dirname(resource_path))
    os.makedirs(resource_folder, exist_ok=True)

    return os.path.join(resource_folder, os.path.basename(resource_path))


async def download_page(url, folder):
    os.makedirs(folder, exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Error fetching {url}: HTTP {response.status}")
                return
            content = await response.text()

        # Save the page itself as index.html in the output folder.
        page_name = os.path.join(folder, "index.html")
        with open(page_name, "w", encoding="utf-8") as file:
            file.write(content)

        # Parse the HTML and download every asset it references.
        soup = BeautifulSoup(content, "html.parser")
        await download_resources(soup, url, folder, session)

    print(f"Page {url} downloaded successfully.")


async def download_resources(soup, base_url, folder, session):
    # Queue one download task per referenced asset.
    tasks = []
    tags = {"img": "src", "link": "href", "script": "src"}

    for tag, attr in tags.items():
        for resource in soup.find_all(tag):
            resource_url = resource.get(attr)
            if resource_url:
                # Resolve relative references against the page URL.
                full_url = urljoin(base_url, resource_url)
                tasks.append(save_resource(full_url, folder, session))

    # Download all assets concurrently.
    await asyncio.gather(*tasks)


async def save_resource(url, folder, session):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                resource_path = create_directory_for_resource(url, folder)
                content = await response.read()
                with open(resource_path, "wb") as file:
                    file.write(content)
                print(f"Resource {url} saved to {resource_path}.")
    except Exception as e:
        print(f"Error downloading resource {url}: {e}")


if __name__ == "__main__":
    url = input("Enter the URL of the website: ")
    # Name the output folder after the site's domain, e.g. example.com.
    parsed_url = urlparse(url)
    folder_name = parsed_url.netloc

    asyncio.run(download_page(url, folder_name))