From 7403e188cad7d42abc761fc4ba254291a5bddca6 Mon Sep 17 00:00:00 2001 From: Ice Lam Date: Sun, 12 Dec 2021 03:42:00 +0800 Subject: [PATCH] fix: rewrite html to markdown converter for support of lists --- app/handler.py | 41 ++++++++++---------------------- app/utils/markdown.py | 54 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 29 deletions(-) create mode 100644 app/utils/markdown.py diff --git a/app/handler.py b/app/handler.py index a57c96f..6d8cca3 100644 --- a/app/handler.py +++ b/app/handler.py @@ -1,10 +1,11 @@ """Lambda function file for sending scheduled message to a connected Telegram chat via Chat ID.""" import os -import re import requests import telegram -from markdownify import markdownify as md + +# AWS Lambda loads handler in a special way so we need to import local modules from 'app' +from app.utils import markdown LEETCODE_DOMAIN = 'https://leetcode.com' LEETCODE_ALL_PROBLEM_URL = LEETCODE_DOMAIN + '/problemset/all/' @@ -60,31 +61,13 @@ def get_question_of_today(): try: return response.json() except ValueError: - print("Failed to decode JSON, API response:") + print('Failed to decode JSON, API response:') print(response.text) raise except BaseException as error: - print(f"Unexpected {error=}, {type(error)=}") + print(f'Unexpected {error=}, {type(error)=}') raise -def generate_telegram_markdown(content): - """Convert HTML to Telegram Markdown syntax""" - - formatted_content = content - # Special handling for superscript and subscript since standard markdown does not support them - formatted_content = re.sub('', '^', formatted_content) - formatted_content = re.sub('', '_', formatted_content) - # Convert allowed tags to markdown - # Note that supported markdown syntax is different in Telegram - # https://core.telegram.org/bots/api#formatting-options - formatted_content = md(formatted_content, convert=['p', 'img', 'code', 'pre']) - # Replace multiple empty lines - formatted_content = re.sub('(\s+)?\n{2,}', '\n\n', 
formatted_content) - # Special handling for images - formatted_content = re.sub('\!\[(.+)?\]\((.+)\)', r'image: \2', formatted_content) - - return formatted_content.strip() - def send_message(event, context): """Lambda function handler to send text message.""" @@ -100,12 +83,11 @@ def send_message(event, context): question_difficulty = question_info['question']['difficulty'] question_content = question_info['question']['content'] - - message = f"*{question_date}*\n" \ - f"*{question_id}. {question_title}*\n\n" \ - f"*Topic:* {question_topic}\n" \ - f"*Difficulty:* {question_difficulty}\n\n" \ - f"*Problem:*\n{generate_telegram_markdown(question_content)}" + message = f'*{question_date}*\n' \ + f'*{question_id}. {question_title}*\n\n' \ + f'*Topic:* {question_topic}\n' \ + f'*Difficulty:* {question_difficulty}\n\n' \ + f'*Problem:*\n{markdown.generate(question_content)}' bot = telegram.Bot(token=TOKEN) bot.send_message( @@ -114,7 +96,8 @@ def send_message(event, context): reply_markup=telegram.InlineKeyboardMarkup([ [telegram.InlineKeyboardButton(text="View on Leetcode", url=question_url)] ]), - parse_mode='Markdown' + parse_mode='Markdown', + disable_web_page_preview=True ) else: raise Exception('Invalid API response. 
No "data" node found in API response.')
diff --git a/app/utils/markdown.py b/app/utils/markdown.py
new file mode 100644
index 0000000..4fe2c27
--- /dev/null
+++ b/app/utils/markdown.py
@@ -0,0 +1,69 @@
+"""Convert HTML to Telegram markdown syntax"""
+
+import re
+from markdownify import MarkdownConverter, BACKSLASH
+
+
+class TelegramMarkdownConverter(MarkdownConverter):
+    """Custom MarkdownConverter that fits Telegram markdown format"""
+
+    def convert_img(self, el, text, convert_as_inline):
+        """Render an image as ``Image: [src](src)``; a missing src becomes ''."""
+        src = el.attrs.get('src', None) or ''
+        return f'Image: [{src}]({src})'
+
+    def convert_br(self, el, text, convert_as_inline):
+        """Render a line break; nothing is emitted in inline context."""
+        if convert_as_inline:
+            return ''
+
+        if self.options['newline_style'].lower() == BACKSLASH:
+            return '\\\n'
+
+        return '\n'
+
+    def convert_p(self, el, text, convert_as_inline):
+        """Render a paragraph plus a blank line; whitespace-only ones vanish."""
+        if convert_as_inline:
+            return text
+        return f'{text}\n\n' if text.strip() else ''
+
+    def convert_pre(self, el, text, convert_as_inline):
+        """Render preformatted text as a fenced code block ('' if empty)."""
+        if not text:
+            return ''
+
+        # A leading newline is added unless the previous sibling is a <p>
+        # (presumably because convert_p already ends with a blank line).
+        previous = el.previous_sibling
+        after_paragraph = bool(previous) and previous.name == 'p'
+        prefix = '' if after_paragraph else '\n'
+
+        return f"{prefix}```{self.options['code_language']}\n{text.strip()}\n```\n\n"
+
+    def convert_sub(self, el, text, convert_as_inline):
+        """Render subscript text prefixed with an underscore."""
+        return f'_{text}'
+
+    def convert_sup(self, el, text, convert_as_inline):
+        """Render superscript text prefixed with a caret."""
+        return f'^{text}'
+
+
+def generate(html, **options):
+    """Convert an HTML string to Telegram markdown.
+
+    Keyword arguments are forwarded to TelegramMarkdownConverter. ``convert``
+    and ``bullets`` only act as defaults, so a caller may override either one
+    without triggering a duplicate-keyword TypeError.
+    """
+
+    converter_options = {
+        'convert': ['br', 'p', 'img', 'code', 'pre', 'ul', 'ol', 'li', 'a', 'sup', 'sub'],
+        'bullets': '•••',
+        **options,
+    }
+    result = TelegramMarkdownConverter(**converter_options).convert(html).strip()
+
+    # Collapse runs of two or more newlines into exactly one blank line.
+    return re.sub(r'\n{2,}', '\n\n', result)