Skip to content

Commit

Permalink
fix: rewrite html to markdown converter for support of lists
Browse files Browse the repository at this point in the history
  • Loading branch information
icelam committed Dec 11, 2021
1 parent 919e908 commit 7403e18
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 29 deletions.
41 changes: 12 additions & 29 deletions app/handler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""Lambda function file for sending scheduled message to a connected Telegram chat via Chat ID."""

import os
import re
import requests
import telegram
from markdownify import markdownify as md

# AWS Lambda loads handler in a special way so we need to import local modules from 'app'
from app.utils import markdown

LEETCODE_DOMAIN = 'https://leetcode.com'
LEETCODE_ALL_PROBLEM_URL = LEETCODE_DOMAIN + '/problemset/all/'
Expand Down Expand Up @@ -60,31 +61,13 @@ def get_question_of_today():
try:
return response.json()
except ValueError:
print("Failed to decode JSON, API response:")
print('Failed to decode JSON, API response:')
print(response.text)
raise
except BaseException as error:
print(f"Unexpected {error=}, {type(error)=}")
print(f'Unexpected {error=}, {type(error)=}')
raise

def generate_telegram_markdown(content):
    """Convert an HTML string to Telegram Markdown syntax.

    Args:
        content: HTML markup to convert.

    Returns:
        The converted text with surrounding whitespace stripped.
    """
    # Standard markdown has no superscript/subscript, so prefix the tag
    # content with a marker character before the tags themselves are dropped.
    formatted_content = content.replace('<sup>', '<sup>^')
    formatted_content = formatted_content.replace('<sub>', '<sub>_')

    # Convert allowed tags to markdown.
    # Note that supported markdown syntax is different in Telegram:
    # https://core.telegram.org/bots/api#formatting-options
    formatted_content = md(formatted_content, convert=['p', 'img', 'code', 'pre'])

    # Collapse runs of empty lines (and any whitespace right before them)
    # into a single blank line. Raw strings avoid invalid escape sequences.
    formatted_content = re.sub(r'(\s+)?\n{2,}', '\n\n', formatted_content)

    # Replace markdown images with a plain "image: <url>" line. The character
    # classes keep every match confined to a single ![alt](url) construct, so
    # several images on one line are each rewritten instead of being merged
    # into one greedy match that drops all but the last URL.
    formatted_content = re.sub(
        r'!\[[^\]\n]*\]\(([^)\n]+)\)', r'image: \1', formatted_content
    )

    return formatted_content.strip()

def send_message(event, context):
"""Lambda function handler to send text message."""

Expand All @@ -100,12 +83,11 @@ def send_message(event, context):
question_difficulty = question_info['question']['difficulty']
question_content = question_info['question']['content']


message = f"*{question_date}*\n" \
f"*{question_id}. {question_title}*\n\n" \
f"*Topic:* {question_topic}\n" \
f"*Difficulty:* {question_difficulty}\n\n" \
f"*Problem:*\n{generate_telegram_markdown(question_content)}"
message = f'*{question_date}*\n' \
f'*{question_id}. {question_title}*\n\n' \
f'*Topic:* {question_topic}\n' \
f'*Difficulty:* {question_difficulty}\n\n' \
f'*Problem:*\n{markdown.generate(question_content)}'

bot = telegram.Bot(token=TOKEN)
bot.send_message(
Expand All @@ -114,7 +96,8 @@ def send_message(event, context):
reply_markup=telegram.InlineKeyboardMarkup([
[telegram.InlineKeyboardButton(text="View on Leetcode", url=question_url)]
]),
parse_mode='Markdown'
parse_mode='Markdown',
disable_web_page_preview=True
)
else:
raise Exception('Invalid API response. No "data" node found in API response.')
54 changes: 54 additions & 0 deletions app/utils/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""Convert HTML to Telegram markdown syntax"""

import re
from markdownify import MarkdownConverter, BACKSLASH

class TelegramMarkdownConverter(MarkdownConverter):
    """MarkdownConverter subclass whose tag handlers emit Telegram-friendly text."""

    def convert_img(self, el, text, convert_as_inline):
        """Render an image as a labelled link whose text and target are its src."""
        source = el.attrs.get('src') or ''
        return 'Image: [{0}]({0})'.format(source)

    def convert_br(self, el, text, convert_as_inline):
        """Render a line break according to the configured newline style."""
        if convert_as_inline:
            return ''

        backslash_style = self.options['newline_style'].lower() == BACKSLASH
        return '\\\n' if backslash_style else '\n'

    def convert_p(self, el, text, convert_as_inline):
        """Render a paragraph followed by a blank line; drop blank paragraphs."""
        if convert_as_inline:
            return text
        if not text.strip():
            return ''
        return '{}\n\n'.format(text)

    def convert_pre(self, el, text, convert_as_inline):
        """Render preformatted text as a fenced code block."""
        if not text:
            return ''

        # A preceding paragraph already ends with a blank line, so only add
        # a leading newline when the previous sibling is not a <p>.
        previous = el.previous_sibling
        if previous and previous.name in ['p']:
            lead = ''
        else:
            lead = '\n'

        language = self.options['code_language']
        body = text.strip()
        return f'{lead}```{language}\n{body}\n```\n\n'

    def convert_sub(self, el, text, convert_as_inline):
        """Render subscript text with a leading underscore."""
        return '_{}'.format(text)

    def convert_sup(self, el, text, convert_as_inline):
        """Render superscript text with a leading caret."""
        return '^{}'.format(text)

def generate(html, **options):
    """Convert HTML to Telegram markdown using predefined converter options.

    Args:
        html: HTML markup to convert.
        **options: Extra keyword options forwarded to
            TelegramMarkdownConverter. Values given here take precedence
            over the predefined ``convert`` and ``bullets`` defaults.

    Returns:
        The converted text, stripped of surrounding whitespace, with every
        run of two or more newlines collapsed to exactly one blank line.
    """
    # Merge instead of passing the defaults as literal keywords next to
    # **options: the latter raises TypeError (duplicate keyword argument)
    # as soon as a caller supplies 'convert' or 'bullets' themselves.
    settings = {
        'convert': ['br', 'p', 'img', 'code', 'pre', 'ul', 'ol', 'li', 'a', 'sup', 'sub'],
        'bullets': '•••',
    }
    settings.update(options)

    result = TelegramMarkdownConverter(**settings).convert(html).strip()

    return re.sub(r'\n{2,}', '\n\n', result)

0 comments on commit 7403e18

Please sign in to comment.