Skip to content

Commit

Permalink
Merge pull request #47 from Xpirix/fetch_all_resources
Browse files Browse the repository at this point in the history
Fetch all resources
  • Loading branch information
Xpirix authored Feb 10, 2025
2 parents 2a450b0 + a92df37 commit cf690f1
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 54 deletions.
3 changes: 2 additions & 1 deletion REQUIREMENTS.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
feedparser==6.0.11
requests==2.32.3
pillow==11.0.0
python-dateutil==2.9.0.post0
python-dateutil==2.9.0.post0
beautifulsoup4==4.13.3
98 changes: 86 additions & 12 deletions fetch_feeds.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
import os
import json
from urllib.parse import urlparse
import string
import random
import requests
import shutil
from datetime import datetime
from scripts.resize_image import resize_image
from scripts.resize_image import resize_image, convert_to_webp, is_valid_image, is_valid_svg
from dateutil.parser import parse as date_parse
from bs4 import BeautifulSoup

# Path to the subscribers.json file
SUBSCRIBERS_JSON_PATH = os.path.join(os.path.dirname(__file__), 'data', 'subscribers.json')
Expand Down Expand Up @@ -59,15 +62,75 @@ def fetch_and_create_post(self):
except Exception as e:
print(f"Failed to process feed for {self.subscriber_name}: {e}")

def fetch_all_images(self, content, subscriber_shortname, post_name):
    """
    Localise the media referenced by a post's HTML content.

    Every <img> with a usable src is downloaded through
    download_and_process_image and its src is rewritten to the returned
    local path. Every <video> for which a URL can be found is replaced by
    a plain "Watch Video" link to that URL.

    param content: HTML fragment of the post
    param subscriber_shortname: Folder name of the subscriber under img/subscribers
    param post_name: Folder name of the post under the subscriber folder
    return: The rewritten HTML as a string
    """
    img_folder = os.path.join("img", "subscribers", subscriber_shortname, post_name)
    unknown_img_folder = os.path.join("static", img_folder, "unknown")
    soup = BeautifulSoup(content, 'html.parser')

    # Always start from an empty "unknown" folder so files left there by a
    # previous run for this post do not accumulate.
    if os.path.exists(unknown_img_folder):
        shutil.rmtree(unknown_img_folder)
    os.makedirs(unknown_img_folder, exist_ok=True)

    for img in soup.find_all('img'):
        # An <img> without src (or with an empty one) has nothing to
        # download; indexing img['src'] would raise KeyError and abort
        # the whole post.
        img_url = img.get('src')
        if not img_url:
            continue
        file_name = self.get_image_name(img_url.split('?')[0])
        try:
            downloaded_img = self.download_and_process_image(
                img_url, file_name, img_folder, unknown_img_folder
            )
            img['src'] = downloaded_img
        except Exception as e:
            # Best effort: one broken image must not drop the whole post.
            img['src'] = ""
            print(f"Failed to process image: {e}")

    for video in soup.find_all('video'):
        # The URL may live on a nested <source> or directly on <video>.
        source = video.find('source')
        video_url = source.get('src') if source is not None else None
        if not video_url:
            video_url = video.get('src')
        if not video_url:
            # No URL to link to: leave the element untouched.
            continue
        video.replace_with(
            soup.new_tag('a', href=video_url, target="_blank", string="Watch Video")
        )

    return str(soup)

def download_and_process_image(self, img_url, file_name, img_folder, unknown_img_folder):
    """
    Download one image and return the site-relative path to reference it by.

    The URL without its query string decides the handling:
    raster extensions are downloaded, validated, resized to a maximum
    height of 600 and converted to webp; .svg files are downloaded and
    validated; anything else is delegated to handle_unknown_image_format
    using the full original URL.

    param img_url: URL of the image as found in the content
    param file_name: File name to store a recognised image under
    param img_folder: Image folder of the post, relative to "static"
    param unknown_img_folder: Folder that receives images of unknown format
    return: Path starting with "/" that points at the stored image
    raises Exception: If the downloaded file is not a valid image
    """
    raster_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')
    bare_url = img_url.split('?')[0]  # Remove query parameters
    lowered_url = bare_url.lower()

    if lowered_url.endswith(raster_suffixes):
        local_path = self.download_image(bare_url, file_name, os.path.join("static", img_folder))
        if not is_valid_image(local_path):
            os.remove(local_path)
            raise Exception(f"Invalid image: {local_path}")
        resize_image(local_path, max_height=600)
        converted_path = convert_to_webp(local_path, replace=True)
        return os.path.join("/", img_folder, os.path.basename(converted_path))

    if lowered_url.endswith('.svg'):
        local_path = self.download_image(bare_url, file_name, os.path.join("static", img_folder))
        if not is_valid_svg(local_path):
            os.remove(local_path)
            raise Exception(f"Invalid image: {local_path}")
        return os.path.join("/", img_folder, file_name)

    # No recognised extension: the helper picks its own file name.
    converted_path = self.handle_unknown_image_format(img_url, unknown_img_folder)
    return os.path.join("/", img_folder, "unknown", os.path.basename(converted_path))

def handle_unknown_image_format(self, img_url, dest_folder):
    """
    Download an image whose URL has no recognised extension and convert it to webp.

    The file is stored under a random "image_<8 chars>.png" name,
    validated, resized to a maximum height of 600 and then converted.

    param img_url: URL of the image to download
    param dest_folder: Folder to store the image in
    return: Path of the file returned by convert_to_webp
    raises Exception: If the downloaded file is not a valid image
    """
    alphabet = string.ascii_letters + string.digits
    token = ''.join(random.choices(alphabet, k=8))

    local_path = self.download_image(
        img_url,
        f"image_{token}.png",
        dest_folder,
        is_unknown=True
    )

    if is_valid_image(local_path):
        resize_image(local_path, max_height=600)
        return convert_to_webp(local_path, replace=True)

    # Do not keep a file that cannot be opened as an image.
    os.remove(local_path)
    raise Exception(f"Invalid image: {local_path}")


def process_entry(self, entry):
try:
dest_folder = self.get_dest_folder()
title = entry.title
# I don't think we need to download images because the images are already in the feed
# image_url = next((link.href for link in entry.links if 'image' in link.type), entry.links[-1].href)
# if image_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp')):
# file_name = self.get_image_name(image_url)
# self.download_image(image_url, file_name, dest_folder)

post_url = entry.link

Expand All @@ -81,6 +144,7 @@ def process_entry(self, entry):

are_tags_present = any(str(category).lower() in tags for category in self.filter_categories)
if are_tags_present:
content = self.fetch_all_images(content, self.shortname, file_name)
content = self.generate_markdown_content(title, entry_date, post_url, content, tags)

# Copy the markdown file to the posts folder
Expand Down Expand Up @@ -169,12 +233,20 @@ def write_to_file(self, filename, content):
with open(filename, "w", encoding="utf=8") as f:
f.write(content)

def download_image(self, image_url, image_name, dest_folder, is_unknown=False):
    """
    Download a file over HTTP and store it in dest_folder.

    param image_url: URL to download
    param image_name: File name to store the download under
    param dest_folder: Destination folder, created if it does not exist
    param is_unknown: Kept for backward compatibility; both kinds of
        download now go through the same code path
    return: Path of the written file
    raises requests.RequestException: On connection errors, timeouts or
        an HTTP error status
    """
    os.makedirs(dest_folder, exist_ok=True)
    image_filename = os.path.join(dest_folder, image_name)

    # The timeout keeps one unresponsive host from hanging the whole run;
    # the context manager releases the connection even on failure.
    with requests.get(image_url, stream=True, timeout=30) as response:
        # Fail before writing anything instead of saving an HTML error
        # page under an image file name.
        response.raise_for_status()
        print(f"Writing: {image_filename}")
        with open(image_filename, "wb") as out_file:
            # iter_content decodes gzip/deflate transfer encodings,
            # which copying response.raw directly does not.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    return image_filename


class FunderProcessor:
Expand Down Expand Up @@ -265,10 +337,11 @@ def process_funder(item):
print(f"Failed to delete {file_path}. Reason: {e}")

# Iterate over the subscribers and fetch posts for active ones
i = 1
for subscriber in subscribers:
if not subscriber.get('is_active'):
continue

print(f"{i}/{len(subscribers)}: Processing feed for {subscriber['name']}")
languages = subscriber.get('languages', {})
available_lang = languages.get('available', DEFAULT_AVAILABLE_LANG)
main_lang = languages.get('main', DEFAULT_MAIN_LANG)
Expand All @@ -283,5 +356,6 @@ def process_funder(item):
filter_categories
)
processor.fetch_and_create_post()
i += 1

# FunderProcessor.fetch_funders()
118 changes: 84 additions & 34 deletions scripts/resize_image.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from PIL import Image
import os
import xml.etree.ElementTree as ET


def resize_image(image_filename, max_height=120):
Expand All @@ -8,38 +9,87 @@ def resize_image(image_filename, max_height=120):
The image is resized in place.
param image_filename: The image file to resize
param max_height: The maximum height in pixels
TODO: Add support for other image formats
"""
if (
image_filename.lower().endswith('.png') or
image_filename.lower().endswith('.jpg')
):
if os.path.exists(image_filename):
print(f'Processing: {image_filename}')
with Image.open(image_filename) as img:
width, height = img.size
if height > max_height:
new_height = max_height
new_width = int((new_height / height) * width)

img_resized = img.resize(
(new_width, new_height), Image.LANCZOS
)

# Determine the file format
file_format = (
'PNG' if image_filename.lower().endswith('.png')
else 'JPEG'
)

# Save the resized image with optimization
img_resized.save(
image_filename,
format=file_format,
optimize=True,
quality=85
)
print(f'Resized and optimized: {image_filename}')
else:
print(f'No resizing needed for: {image_filename}')
else:
print(f'File not found: {image_filename}')
if os.path.exists(image_filename):
with Image.open(image_filename) as img:
width, height = img.size
if height > max_height:
new_height = max_height
new_width = int((new_height / height) * width)

img_resized = img.resize(
(new_width, new_height), Image.LANCZOS
)

# Determine the file format
file_format = image_filename.split('.')[-1].upper()
if file_format == 'JPG':
file_format = 'JPEG'

# Save the resized image with optimization
img_resized.save(
image_filename,
format=file_format,
optimize=True,
quality=85
)
else:
print(f'File not found: {image_filename}')

# Transform an image into webp format
def convert_to_webp(image_filename, replace=False):
    """
    Convert an image to webp format.

    The webp file is written next to the source file, with the same base
    name and a ".webp" extension. Files whose extension is not one of
    .png, .jpg, .jpeg or .tiff are left untouched.

    param image_filename: The image file to convert
    param replace: If True, delete the source file after the conversion
    return: The path of the webp file, or image_filename unchanged when
        the extension is not supported
    raises FileNotFoundError: If a supported file does not exist
    """
    supported_formats = ('.png', '.jpg', '.jpeg', '.tiff')
    base_name, image_ext = os.path.splitext(image_filename)
    if image_ext.lower() not in supported_formats:
        return image_filename
    if not os.path.exists(image_filename):
        print(f'File not found: {image_filename}')
        raise FileNotFoundError(image_filename)

    # Swap only the extension. A str.replace over the whole path would also
    # rewrite directory names containing "png"/"jpg", and would leave an
    # upper-case extension unchanged so the source gets overwritten and,
    # with replace=True, deleted.
    webp_filename = base_name + '.webp'
    with Image.open(image_filename) as img:
        # Save the image in webp format with optimization
        img.save(
            webp_filename,
            format='WEBP',
            optimize=True,
            quality=85
        )
    # Remove the source only once it is closed; deleting an open file
    # fails on some platforms.
    if replace:
        os.remove(image_filename)
    return webp_filename

# Check if the image is valid
def is_valid_image(image_filename):
    """
    Check if the image file is valid.

    param image_filename: The image file to check
    return: True if the image is valid, False otherwise
    """
    try:
        # The context manager closes the file handle that Image.open
        # keeps, including when verify() raises.
        with Image.open(image_filename) as img:
            img.verify()
        return True
    except Exception:
        # Any failure to open or verify the file means it is unusable.
        print(f'Invalid image: {image_filename}')
        return False

def is_valid_svg(svg_filename):
    """
    Check if the svg file is valid.

    The file must be well-formed XML whose root element is <svg>, with
    or without a namespace.

    param svg_filename: The svg file to check
    return: True if the svg is valid, False otherwise (including when the
        file is missing or cannot be read)
    """
    # NOTE(review): xml.etree.ElementTree is not hardened against
    # maliciously constructed XML (e.g. entity expansion). If these files
    # come from untrusted feeds, consider a hardened parser.
    try:
        root = ET.parse(svg_filename).getroot()
    except (ET.ParseError, OSError):
        # Malformed XML, or the file is missing/unreadable.
        return False
    # Namespaced documents report the tag as "{namespace}svg".
    return root.tag == 'svg' or root.tag.endswith('}svg')
4 changes: 4 additions & 0 deletions themes/hugo-bulma-blocks-theme/assets/sass/bulma.sass
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
{{ $truenoSBd := resources.Get "webfonts/TruenoSBd.otf" }}
{{ $truenoBd := resources.Get "webfonts/TruenoBd.otf" }}
{{ $truenoUltBlk := resources.Get "webfonts/TruenoUltBlk.otf" }}
{{ $countryFlagsEmoji := resources.Get "webfonts/TwemojiCountryFlags.woff2" }}

@font-face
font-family: 'Montserrat'
Expand Down Expand Up @@ -62,6 +63,9 @@
src: url("{{ $truenoUltBlk.RelPermalink }}") format("opentype")
font-weight: 700

// Country flag emoji font. The resource is a .woff2 file, so the format
// hint must be "woff2" rather than "opentype".
@font-face
  font-family: "Twemoji Country Flags"
  src: url("{{ $countryFlagsEmoji.RelPermalink }}") format("woff2")

{{ $worksans := resources.Get "webfonts/worksans.woff2" }}

Expand Down
Binary file not shown.
7 changes: 0 additions & 7 deletions themes/hugo-bulma-blocks-theme/layouts/partials/header.html
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,6 @@
src="{{ .Site.Params.uniNavHeaderUrl }}"
></script>

<!-- Countries Flag for windows -->
<!-- Added by Lova -->
<!-- See https://github.com/talkjs/country-flag-emoji-polyfill -->
<script type="module" defer>
import { polyfillCountryFlagEmojis } from "https://cdn.skypack.dev/country-flag-emoji-polyfill";
polyfillCountryFlagEmojis();
</script>
</head>

<body></body>
Expand Down

0 comments on commit cf690f1

Please sign in to comment.