Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
SavinDaniil authored Jun 30, 2024
1 parent c0d977a commit 6df1736
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 0 deletions.
92 changes: 92 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import os
import re
import threading
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Matches a CSS reference like  url(foo/bar.png)  — group 1 is the
# "url(" prefix, group 2 the unquoted reference itself (quoted url("...")
# references are deliberately skipped, as in the original logic).
_CSS_URL_RE = re.compile(r'(url\(+)(?!")([^)]*)')


def _download(session, fileurl, filepath):
    """Fetch *fileurl* with *session* and write it to *filepath*.

    Skips the request entirely when the file already exists on disk,
    so repeated references to the same resource are downloaded once.
    """
    if not os.path.isfile(filepath):
        with open(filepath, 'wb') as file:
            file.write(session.get(fileurl).content)


def _rewrite_css(text, pagefolder, session, url):
    """Rewrite every ``url(...)`` reference in CSS *text* to a local path.

    Each referenced resource is downloaded into *pagefolder* (best-effort:
    a failed download is ignored and the rewritten reference is kept, so
    one broken URL does not abort the rest of the stylesheet).
    Returns the rewritten CSS text.
    """
    def _replace(match):
        ref = match.group(2)
        filename = ref.split('/')[-1]
        localpath = '../' + os.path.join(pagefolder, filename).replace('\\', '/')
        try:
            _download(session, urljoin(url, ref), os.path.join(pagefolder, filename))
        except Exception:
            pass  # best-effort, mirroring the original swallow-and-continue
        return match.group(1) + localpath

    return _CSS_URL_RE.sub(_replace, text)


def save_file(soup, pagefolder, session, url, tag, inner):
    """Localize all *tag* elements of *soup*: download the resources they
    reference into *pagefolder* and point the markup at the local copies.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page; it is MUTATED in place.
        NOTE(review): several threads mutate the same soup concurrently
        (see save_page) — bs4 makes no thread-safety guarantee; confirm.
    pagefolder : str
        Directory that receives the downloaded resource files.
    session : requests.Session
        Session used for every download.
    url : str
        Base URL of the page, used to resolve relative references.
    tag : str
        Tag name to process ('img', 'link', 'script', 'style', 'base').
    inner : str
        Attribute holding the resource reference ('src'/'href'); unused
        for 'style' and 'base'.
    """
    # exist_ok avoids the check-then-create race: save_page runs this
    # function concurrently in several threads over the same folder.
    os.makedirs(pagefolder, exist_ok=True)
    for res in soup.findAll(tag):
        if tag == 'link' and res.has_attr('crossorigin'):
            # crossorigin would make the browser refuse the local copy
            del res.attrs['crossorigin']
        if tag == 'base':
            # a <base> tag would re-root every relative link we rewrite
            res.extract()
        elif tag == 'style':
            if res.string:
                res.string = _rewrite_css(res.string.strip(), pagefolder,
                                          session, url)
        elif res.has_attr(inner):
            try:
                filename, ext = os.path.splitext(os.path.basename(res[inner]))
                ext = ext.split('?')[0]  # drop cache-buster ("app.js?v=3")
                # sanitize: keep only word characters so the name is a
                # valid, collision-tolerant filename on every OS
                filename = re.sub(r'\W+', '', filename) + ext
                fileurl = urljoin(url, res.get(inner))
                filepath = os.path.join(pagefolder, filename)
                # pages are written under templates/, hence the '../' hop
                res[inner] = '../' + os.path.join(pagefolder, filename).replace('\\', '/')
                if tag == 'img' and res.has_attr('srcset'):
                    # srcset would override our rewritten src
                    res.attrs['srcset'] = ''
                _download(session, fileurl, filepath)
            except Exception:
                pass  # best-effort: skip resources that cannot be saved


def save_page(url, pagepath):
    """Download the page at *url* and save a browsable local copy.

    The HTML is written to ``templates/<pagepath>.html`` and every
    referenced resource (images, stylesheets, scripts) is downloaded
    into ``sites/<pagepath>_files/`` with the markup rewritten to point
    at the local copies.

    Parameters
    ----------
    url : str
        Page to mirror.
    pagepath : str
        Base name for the saved page (any extension is stripped).

    Raises
    ------
    requests.exceptions.ConnectionError
        Propagated unchanged when the page itself cannot be fetched.
    """
    path, _ = os.path.splitext(pagepath)
    pagefolder = f'sites/{path}_files'
    session = requests.Session()
    # No try/except needed: a ConnectionError simply propagates as-is.
    response = session.get(url)

    # Pass raw bytes: BeautifulSoup sniffs the page's declared encoding,
    # whereas an unconditional .decode('utf-8') crashes on non-UTF-8 pages.
    soup = BeautifulSoup(response.content, "html.parser")
    tags_inner = {'img': 'src', 'link': 'href', 'script': 'src', 'style': '', 'base': ''}
    threads = []
    # Save and rename the resource files, one worker thread per tag kind.
    # NOTE(review): the threads share one soup and one Session — neither is
    # documented as thread-safe; confirm this is acceptable for this tool.
    for tag, inner in tags_inner.items():
        thread = threading.Thread(target=save_file,
                                  args=(soup, pagefolder, session, url, tag, inner))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

    with open(f'templates/{path}.html', 'wb') as file:
        file.write(soup.prettify('utf-8'))


# examples
# save_page('https://github.com/', 'github')


3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
beautifulsoup4==4.12.3
Requests==2.31.0
urllib3==2.2.1

0 comments on commit 6df1736

Please sign in to comment.