
Commit

feat: 2024 update
Fix immoscout24.ch scraping, add cycling and public transport travel modes to the Google Distance Matrix API calls, and update the Discord embeds.

Pin the TLS version and cipher security level (OpenSSL SSL context) so that the scraped data is no longer masked / scrambled.

Cap the number of image files attached per embed at 4.
dvdblk committed Jul 6, 2024
1 parent 192cc48 commit 7d4d58d
Showing 10 changed files with 165 additions and 63 deletions.
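The distance-matrix change below threads a per-travel-mode result from compute_distance through the manager and into the Discord embed. A minimal sketch of the expected shape, with purely illustrative values rather than real API output:

# Hypothetical example of the value returned by compute_distance()
# and passed to send_discord_listing_embed as immo_distances.
# Keys mirror the Distance Matrix "mode" parameter; values are
# (distance_text, duration_text) tuples.
immo_distances = {
    "driving": ("4.2 km", "12 mins"),    # illustrative values
    "transit": ("4.5 km", "18 mins"),    # illustrative values
    "bicycling": ("4.3 km", "15 mins"),  # illustrative values
}

for mode, (distance, duration) in immo_distances.items():
    print(f"{mode}: {distance} ({duration})")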
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
.env
.env.dev
**.env**

venv/

.vscode/*
46 changes: 41 additions & 5 deletions app/__init__.py
@@ -1,6 +1,10 @@
"""App module"""
from typing import Any, Dict

import logging
import ssl
import sys
import time

import aiohttp

@@ -27,9 +31,41 @@ def setup_custom_logger(name) -> logging.Logger:


def init_client_session() -> aiohttp.ClientSession:
"""Create ClientSession with headers"""
return aiohttp.ClientSession(
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0"
}
"""Create ClientSession with no-cache headers"""
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:127.0) Gecko/20100101 Firefox/127.0",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate, br, zstd",
"DNT": "1",
"Sec-GPC": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Priority": "u=1",
"TE": "trailers"
}

async def add_nocache_headers(session, trace_config_ctx, params):
params.headers.update({
"X-No-Cache": str(time.time())
})

trace_config = aiohttp.TraceConfig()
trace_config.on_request_start.append(add_nocache_headers)

ssl_context = ssl.create_default_context()
ssl_context.set_ciphers("DEFAULT@SECLEVEL=1")
ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
ssl_context.maximum_version = ssl.TLSVersion.TLSv1_2
tcp_connector = aiohttp.TCPConnector(
ssl=ssl_context,
use_dns_cache=False,
ttl_dns_cache=300,
limit=100
)

return aiohttp.ClientSession(headers=headers, trace_configs=[trace_config], connector=tcp_connector)
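
A minimal usage sketch of the new session factory, assuming the package is importable as app and the call happens inside an async entry point (the URL is illustrative):

import asyncio

from app import init_client_session


async def main():
    session = init_client_session()
    try:
        # Every request also gets the X-No-Cache header via the trace config.
        async with session.get("https://www.immoscout24.ch/en/real-estate/rent/city-zuerich") as resp:
            print(resp.status)
    finally:
        await session.close()


asyncio.run(main())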
3 changes: 2 additions & 1 deletion app/immo/model.py
@@ -1,6 +1,6 @@
from dataclasses import dataclass, fields, _MISSING_TYPE
from enum import Enum
from typing import List
from typing import List, Optional


class ImmoPriceKind(Enum):
@@ -21,6 +21,7 @@ class ImmoData:
rooms: str = "-"
living_space: str = "-"
currency: str = "CHF"
lister_logo_url: Optional[str] = None

def __post_init__(self):
# Set default values properly for None input arguments
67 changes: 67 additions & 0 deletions app/immo/parser.py
@@ -22,6 +22,73 @@ class ImmoParser:
def _parse_immoscout24(html: BeautifulSoup) -> List[ImmoData]:
"""Parse immoscout24.ch listings
Returns:
list[ImmoData]: ImmoData of listings on the immo website
"""
# try to get the json that is inside of a specific script tag
json_raw_data = html.find(
lambda tag: tag.name == "script" and \
tag.string and \
tag.string.startswith("window.__INITIAL_STATE__=")
).text.lstrip("window.__INITIAL_STATE__=")

# cleaning up not needed so just load the json
listings_json = json.loads(json_raw_data)

try:
listings = listings_json["resultList"]["search"]["fullSearch"]["result"]["listings"]

except KeyError:
raise ImmoParserError("Listings json path changed.")

immo_data_list = []
for listing in listings:
if lister_logo_url := listing.get("listerBranding"):
lister_logo_url = lister_logo_url.get("logoUrl")

# unwrap the listing
listing = listing["listing"]
primary_localization = listing["localization"]["primary"]
try:
title = listing["localization"][primary_localization]["text"]["title"]
loc = listing["address"]["locality"]
plz = listing["address"]["postalCode"]
street = listing["address"]["street"]
address = f"{street}, {plz} {loc}"
except KeyError:
address = None
title = "Wohnung"
url = f"/rent/{listing.get('id')}"
rent = listing.get("prices").get("rent").get("gross") # gross
rooms = listing.get("characteristics").get("numberOfRooms")
living_space = listing.get("characteristics").get("livingSpace")

images = []
for attachment in listing["localization"][primary_localization]["attachments"]:
if attachment["type"] == "IMAGE":
images.append(attachment["url"].encode().decode("unicode-escape"))

immo_data_list.append(
ImmoData(
title=title,
address=address,
url=url,
price=rent,
rooms=rooms,
living_space=living_space,
images=images,
lister_logo_url=lister_logo_url
)
)

return immo_data_list

@staticmethod
def _parse_immoscout24_old(html: BeautifulSoup) -> List[ImmoData]:
"""DEPRECATED, worked for older version of immoscout24.ch
Parse immoscout24.ch listings
Returns:
list[ImmoData]: ImmoData of listings on the immo website
"""
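One caveat in the extraction above: str.lstrip strips a set of characters rather than a literal prefix, which only works here because the JSON body starts with "{". A small standalone sketch of the same extraction using str.removeprefix (Python 3.9+), run against a hypothetical, heavily trimmed results page:

import json

from bs4 import BeautifulSoup

# Hypothetical markup standing in for a scraped immoscout24.ch results page.
html = BeautifulSoup(
    '<script>window.__INITIAL_STATE__={"resultList": {"search": {}}}</script>',
    "html.parser",
)

script = html.find(
    lambda tag: tag.name == "script"
    and tag.string
    and tag.string.startswith("window.__INITIAL_STATE__=")
)

# removeprefix drops the exact prefix instead of a character set.
state = json.loads(script.string.removeprefix("window.__INITIAL_STATE__="))
print(state["resultList"]["search"])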
7 changes: 3 additions & 4 deletions app/manager.py
@@ -62,14 +62,14 @@ async def _send_discord_message(self, listing):
if self.google_maps_api_key:
# Compute the distance from apartment address to the destination address
# in this case, default destination address = 'Rämistrasse, Zürich, Switzerland'
distance, duration = await compute_distance(
distance_results = await compute_distance(
self.scraper.session,
self.google_maps_api_key,
origin_address=listing.address,
destination_address=self.google_maps_destination_address,
)
else:
distance, duration = None, None
distance_results = None

await send_discord_listing_embed(
self.discord,
@@ -78,8 +78,7 @@ async def _send_discord_message(self, listing):
hostname=self.immo_website.value,
host_url=self.immo_website_url,
host_icon_url=self.immo_website.author_icon_url,
immo_distance=distance,
immo_duration=duration,
immo_distances=distance_results,
)
self.logger.debug("sent %s", listing.url)

27 changes: 18 additions & 9 deletions app/utils/discord.py
@@ -2,7 +2,7 @@
from datetime import datetime
from functools import reduce
from io import BytesIO
from typing import List, Tuple
from typing import Dict, List, Tuple

import aiohttp
from discord import Embed, File, Webhook
@@ -55,8 +55,7 @@ async def send_discord_listing_embed(
hostname: str,
host_url: str,
host_icon_url: str,
immo_distance: str,
immo_duration: str,
immo_distances: Dict[str, Tuple[str, str]]
):
"""Sends an embed message from listing (immo) data"""
embeds = []
@@ -71,16 +70,26 @@
embed.add_field(name=immo_data.price_kind.value, value=immo_data.price, inline=True)
embed.add_field(name="Rooms", value=immo_data.rooms, inline=True)
embed.add_field(name="Living space", value=immo_data.living_space, inline=True)
if immo_distance and immo_duration:
# Add embed for distance and travel duration if possible
embed.add_field(
name="Distance", value=f"{immo_distance} ({immo_duration})", inline=True
)
embed.set_footer(text=immo_data.address)
if immo_distances is not None:
mode_emoji_map = {
"driving": "🚙",
"transit": "🚋",
"bicycling": "🚲",
}
# Add embed for each distance and travel duration type
for mode, (distance, duration) in immo_distances.items():
embed.add_field(
name=f"Distance {mode_emoji_map[mode]}",
value=f"{distance} ({duration})",
inline=True,
)

embed.set_footer(text=immo_data.address, icon_url=immo_data.lister_logo_url or "")
images, files = await _images_viewable_in_embed(immo_data.images, session)

n_images = min(len(images), 4)
files = files[:n_images]

if n_images > 0:
embed.set_image(url=images[0])
# Save first embed
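If the Distance Matrix call were ever extended with another mode (for example walking), the direct mode_emoji_map[mode] lookup above would raise KeyError; a defensive variant, purely as a sketch:

mode_emoji_map = {
    "driving": "🚙",
    "transit": "🚋",
    "bicycling": "🚲",
}


def mode_label(mode: str) -> str:
    # Fall back to a generic pin for modes without a dedicated emoji.
    return f"Distance {mode_emoji_map.get(mode, '📍')}"


print(mode_label("walking"))  # -> "Distance 📍"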
52 changes: 30 additions & 22 deletions app/utils/google_maps.py
@@ -5,6 +5,7 @@
)
from math import ceil

import asyncio
from aiohttp import ClientSession


@@ -22,25 +23,32 @@ async def compute_distance(
base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"
scheme, netloc, path, _, fragment = urlsplit(base_url)

url_params = {
"origins": origin_address,
"destinations": destination_address,
"key": gmaps_api_key
}
query_params = urlencode(url_params)

request_url = urlunsplit((scheme, netloc, path, query_params, fragment))
resp = await session.get(request_url)
if resp.status == 200:
resp_json = await resp.json()
try:
# Get distance in minutes and kilometers
distance = resp_json["rows"][0]["elements"][0]["distance"]["text"]
duration_value = resp_json["rows"][0]["elements"][0]["duration"]["value"]
duration = ceil(duration_value / 60)

return distance, f"{duration} min."
except (KeyError, IndexError):
pass

return None, None
modes = ["driving", "transit", "bicycling"]

results = {}

async def fetch_distance(mode):
url_params = {
"origins": origin_address,
"destinations": destination_address,
"mode": mode,
"key": gmaps_api_key
}
query_params = urlencode(url_params)
request_url = urlunsplit((scheme, netloc, path, query_params, fragment))

async with session.get(request_url) as resp:
if resp.status == 200:
resp_json = await resp.json()
try:
element = resp_json["rows"][0]["elements"][0]
distance = element["distance"]["text"]
duration = element["duration"]["text"]
return mode, (distance, duration)
except (KeyError, IndexError):
return mode, (None, None)
else:
return mode, (None, None)

results = dict(await asyncio.gather(*(fetch_distance(mode) for mode in modes)))
return results
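
The try/except above already copes with missing keys; a variant that also checks the element-level status field, assuming the documented Distance Matrix response format, could look like this:

def parse_element(resp_json: dict) -> tuple:
    """Pull (distance_text, duration_text) out of a Distance Matrix response,
    returning (None, None) when the element is not usable."""
    try:
        element = resp_json["rows"][0]["elements"][0]
    except (KeyError, IndexError):
        return None, None
    # Elements carry their own status, e.g. "OK", "NOT_FOUND" or "ZERO_RESULTS".
    if element.get("status") != "OK":
        return None, None
    return element["distance"]["text"], element["duration"]["text"]


# Illustrative response fragment, not real API output.
sample = {"rows": [{"elements": [{"status": "ZERO_RESULTS"}]}]}
print(parse_element(sample))  # -> (None, None)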
preview.sh 100644 → 100755 (empty file; mode changed to executable)
1 change: 0 additions & 1 deletion requirements.in
@@ -2,6 +2,5 @@ aiohttp
beautifulsoup4
discord.py
pydantic
pylint
python-dotenv
sentry-sdk
23 changes: 2 additions & 21 deletions requirements.txt
@@ -1,15 +1,13 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile requirements.in
# pip-compile --output-file=requirements.txt requirements.in
#
aiohttp==3.7.4.post0
# via
# -r requirements.in
# discord-py
astroid==2.9.3
# via pylint
async-timeout==3.0.1
# via aiohttp
attrs==21.4.0
@@ -24,40 +22,23 @@ discord-py==1.7.3
# via -r requirements.in
idna==3.3
# via yarl
isort==5.10.1
# via pylint
lazy-object-proxy==1.7.1
# via astroid
mccabe==0.6.1
# via pylint
multidict==6.0.2
# via
# aiohttp
# yarl
platformdirs==2.5.1
# via pylint
pydantic==1.10.1
# via -r requirements.in
pylint==2.12.2
# via -r requirements.in
python-dotenv==0.20.0
# via -r requirements.in
sentry-sdk==1.15.0
# via -r requirements.in
soupsieve==2.3.1
# via beautifulsoup4
toml==0.10.2
# via pylint
typing-extensions==4.1.1
# via
# aiohttp
# pydantic
urllib3==1.26.14
# via sentry-sdk
wrapt==1.13.3
# via astroid
yarl==1.7.2
# via aiohttp

# The following packages are considered to be unsafe in a requirements file:
# setuptools
