
Commit

Update v1.1
- Fixed an error that occurred if you didn't put https:// at the beginning of the link
- Added a debug mode for more information about downloading
- Added a logger
- All copied sites are now saved in the copied/ folder
Weever1337 committed Sep 28, 2024
1 parent c3ed23a commit ec675ed
Showing 6 changed files with 142 additions and 75 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
+copied/

# PyInstaller
# Usually these files are written by a python script from a template
14 changes: 11 additions & 3 deletions README.md
@@ -19,17 +19,25 @@ pip install -r requirements.txt
After installing the necessary libraries, you can run the script and input any website URL to copy its contents.

```bash
-python webcopy.py
+python main.py
```
The script will prompt you to enter the URL of the website you want to copy. The content will be saved in a folder named after the domain of the website.
Or you can use command-line arguments:
```bash
python main.py -u https://example.com
```
You can also enable debug mode:
```bash
python main.py -u https://example.com -d
```

## Example
```bash
Enter the URL of the website: https://example.com
```
-This will create a folder named example.com in your current directory, containing the website's HTML and assets in their respective subfolders.
+This will create a folder named example.com inside the `copied` directory, containing the website's HTML and assets in their respective subfolders.
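For example, a page that links one stylesheet, one script, and one image might produce a layout like this (hypothetical file names; resource paths mirror the URL paths):

```
copied/
└── example.com/
    ├── index.html
    ├── css/style.css
    ├── js/app.js
    └── images/logo.png
```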

## Requirements
-- Python 3.7+
+- Python 3.8+
- aiohttp
- beautifulsoup4
27 changes: 27 additions & 0 deletions main.py
@@ -0,0 +1,27 @@
import argparse
import asyncio
from urllib.parse import urlparse

from src.webcopy import download_page
from src import logger


async def main():
    parser = argparse.ArgumentParser(description="Webcopy")
    parser.add_argument(
        "-u", "-l", "--url", "--link", type=str, help="The URL of the website to copy"
    )
    parser.add_argument(
        "-d", "-debug", "--debug", action="store_true", help="Enable debug mode"
    )
    args = parser.parse_args()
    url = args.url or input("Enter the URL of the website: ")
    debug = args.debug
    # Prepend a scheme if the user omitted it (the v1.1 fix).
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Every copy lands under copied/<domain>.
    folder_name = "copied/" + urlparse(url).netloc
    logger.info(f"Downloading website from {url} to {folder_name}")
    await download_page(url, folder_name, debug)


if __name__ == "__main__":
    asyncio.run(main())
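Since `-l`/`--link` are aliases for `-u`/`--url` and a missing scheme is filled in automatically, the following invocations are equivalent:

```bash
python main.py --url https://example.com --debug
python main.py -l example.com -d
```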
34 changes: 34 additions & 0 deletions src/__init__.py
@@ -0,0 +1,34 @@
from loguru import logger
import sys


def setup_logger():
    # One sink per level so each level gets its own colour scheme.
    info = "<blue>{time:HH:mm:ss}</blue> => <green>{message}</green>"
    error = "<red>{time:HH:mm:ss}</red> => <red>{message}</red>"
    debug = "<yellow>{time:HH:mm:ss}</yellow> => <yellow>{message}</yellow>"

    logger.remove()  # drop loguru's default handler
    logger.add(
        sys.stdout,
        colorize=True,
        format=info,
        level="INFO",
        filter=lambda record: record["level"].name == "INFO",
    )
    logger.add(
        sys.stdout,
        colorize=True,
        format=error,
        level="ERROR",
        filter=lambda record: record["level"].name == "ERROR",
    )
    logger.add(
        sys.stdout,
        colorize=True,
        format=debug,
        level="DEBUG",
        filter=lambda record: record["level"].name == "DEBUG",
    )


setup_logger()
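A minimal usage sketch: importing the package runs `setup_logger()` once, so any module can pull in the shared logger (webcopy.py below does exactly this). Note that the DEBUG sink is always attached; the application decides whether to emit debug messages by checking its own `debug` flag before calling `logger.debug`:

```python
from src import logger  # importing the package configures the sinks

logger.info("INFO messages print in the blue/green format")
logger.error("ERROR messages print in red")
logger.debug("DEBUG messages print in yellow")
```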
69 changes: 69 additions & 0 deletions src/webcopy.py
@@ -0,0 +1,69 @@
import os
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

from . import logger


async def create_directory_for_resource(resource_url, folder):
    # Mirror the URL path under the target folder, e.g.
    # https://example.com/css/style.css -> <folder>/css/style.css
    parsed_url = urlparse(resource_url)
    resource_path = parsed_url.path.lstrip("/")
    resource_folder = os.path.join(folder, os.path.dirname(resource_path))

    if not os.path.exists(resource_folder):
        os.makedirs(resource_folder)

    return os.path.join(resource_folder, os.path.basename(resource_path))


async def download_page(url, folder, debug):
    os.makedirs(folder, exist_ok=True)
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}")
                    return

                content = await response.text()
                with open(
                    os.path.join(folder, "index.html"), "w", encoding="utf-8"
                ) as file:
                    file.write(content)

                await download_resources(
                    BeautifulSoup(content, "html.parser"), url, folder, session, debug
                )
    except Exception as e:
        logger.error(f"Error downloading page {url}: {e}")
        if debug:
            from traceback import format_exc

            logger.error(format_exc())
        return

    logger.info(f"Page {url} downloaded successfully.")


async def download_resources(soup, base_url, folder, session, debug):
    # Collect every <img src>, <link href>, and <script src> reference
    # and download them concurrently.
    tasks = [
        save_resource(urljoin(base_url, resource.get(attr)), folder, session, debug)
        for tag, attr in {"img": "src", "link": "href", "script": "src"}.items()
        for resource in soup.find_all(tag)
        if resource.get(attr)
    ]
    if debug:
        logger.debug(f"Downloading {len(tasks)} resources")
    await asyncio.gather(*tasks)


async def save_resource(url, folder, session, debug):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                resource_path = await create_directory_for_resource(url, folder)
                with open(resource_path, "wb") as file:
                    file.write(await response.read())
                if debug:
                    logger.debug(f"Resource {url} saved to {resource_path}")
    except Exception as e:
        logger.error(f"Error downloading resource {url}: {e}")
72 changes: 0 additions & 72 deletions webcopy.py

This file was deleted.
