From 7403e188cad7d42abc761fc4ba254291a5bddca6 Mon Sep 17 00:00:00 2001
From: Ice Lam <pinky.yy.lam@gmail.com>
Date: Sun, 12 Dec 2021 03:42:00 +0800
Subject: [PATCH] fix: rewrite html-to-markdown converter to support lists

---
 app/handler.py        | 41 ++++++++++----------------------
 app/utils/markdown.py | 54 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 29 deletions(-)
 create mode 100644 app/utils/markdown.py
diff --git a/app/handler.py b/app/handler.py
index a57c96f..6d8cca3 100644
--- a/app/handler.py
+++ b/app/handler.py
@@ -1,10 +1,11 @@
 """Lambda function file for sending scheduled message to a connected Telegram chat via Chat ID."""
 
 import os
-import re
 import requests
 import telegram
-from markdownify import markdownify as md
+
+# AWS Lambda loads the handler in a special way, so local modules must be imported via the 'app' package.
+from app.utils import markdown
 
 LEETCODE_DOMAIN = 'https://leetcode.com'
 LEETCODE_ALL_PROBLEM_URL = LEETCODE_DOMAIN + '/problemset/all/'
@@ -60,31 +61,13 @@ def get_question_of_today():
     try:
         return response.json()
     except ValueError:
-        print("Failed to decode JSON, API response:")
+        print('Failed to decode JSON, API response:')
         print(response.text)
         raise
     except BaseException as error:
-        print(f"Unexpected {error=}, {type(error)=}")
+        print(f'Unexpected {error=}, {type(error)=}')
         raise
 
-def generate_telegram_markdown(content):
-    """Convert HTML to Telegram Markdown syntax"""
-
-    formatted_content = content
-    # Special handling for superscript and subscript since standard markdown does not support them
-    formatted_content = re.sub('<sup>', '<sup>^', formatted_content)
-    formatted_content = re.sub('<sub>', '<sub>_', formatted_content)
-    # Convert allowed tags to markdown
-    # Note that supported markdown syntax is different in Telegram
-    # https://core.telegram.org/bots/api#formatting-options
-    formatted_content = md(formatted_content, convert=['p', 'img', 'code', 'pre'])
-    # Replace multiple empty lines
-    formatted_content = re.sub('(\s+)?\n{2,}', '\n\n', formatted_content)
-    # Special handling for images
-    formatted_content = re.sub('\!\[(.+)?\]\((.+)\)', r'image: \2', formatted_content)
-
-    return formatted_content.strip()
-
 def send_message(event, context):
     """Lambda function handler to send text message."""
 
@@ -100,12 +83,11 @@ def send_message(event, context):
         question_difficulty = question_info['question']['difficulty']
         question_content = question_info['question']['content']
 
-
-        message = f"*{question_date}*\n" \
-        f"*{question_id}. {question_title}*\n\n" \
-        f"*Topic:* {question_topic}\n" \
-        f"*Difficulty:* {question_difficulty}\n\n" \
-        f"*Problem:*\n{generate_telegram_markdown(question_content)}"
+        message = f'*{question_date}*\n' \
+        f'*{question_id}. {question_title}*\n\n' \
+        f'*Topic:* {question_topic}\n' \
+        f'*Difficulty:* {question_difficulty}\n\n' \
+        f'*Problem:*\n{markdown.generate(question_content)}'
 
         bot = telegram.Bot(token=TOKEN)
         bot.send_message(
@@ -114,7 +96,8 @@ def send_message(event, context):
             reply_markup=telegram.InlineKeyboardMarkup([
                 [telegram.InlineKeyboardButton(text="View on Leetcode", url=question_url)]
             ]),
-            parse_mode='Markdown'
+            parse_mode='Markdown',
+            disable_web_page_preview=True
         )
     else:
         raise Exception('Invalid API response. No "data" node found in API response.')
diff --git a/app/utils/markdown.py b/app/utils/markdown.py
new file mode 100644
index 0000000..4fe2c27
--- /dev/null
+++ b/app/utils/markdown.py
@@ -0,0 +1,54 @@
+"""Convert HTML to Telegram markdown syntax"""
+
+import re
+from markdownify import MarkdownConverter, BACKSLASH
+
+class TelegramMarkdownConverter(MarkdownConverter):
+    """MarkdownConverter subclass whose tag handlers emit Telegram-style markdown."""
+
+    def convert_img(self, el, text, convert_as_inline):
+        """Render an image as the label 'Image:' plus a link to its ``src``."""
+        source = el.attrs.get('src') or ''
+        return 'Image: [{0}]({0})'.format(source)
+
+    def convert_br(self, el, text, convert_as_inline):
+        """Render a line break: nothing when inline, otherwise a newline."""
+        if convert_as_inline:
+            return ''
+        return '\\\n' if self.options['newline_style'].lower() == BACKSLASH else '\n'
+
+    def convert_p(self, el, text, convert_as_inline):
+        """Render a paragraph followed by a blank line; blank paragraphs vanish."""
+        if not convert_as_inline:
+            return f'{text}\n\n' if text.strip() else ''
+        return text
+
+    def convert_pre(self, el, text, convert_as_inline):
+        """Render preformatted text as a fenced code block followed by a blank line."""
+        if not text:
+            return ''
+        sibling = el.previous_sibling
+        # A non-blank preceding <p> already ends with a blank line, so add no extra newline.
+        lead = '' if sibling and sibling.name == 'p' else '\n'
+        return f"{lead}```{self.options['code_language']}\n{text.strip()}\n```\n\n"
+
+    def convert_sub(self, el, text, convert_as_inline):
+        """Render subscript text with a leading underscore."""
+        return '_' + text
+
+    def convert_sup(self, el, text, convert_as_inline):
+        """Render superscript text with a leading caret."""
+        return '^' + text
+
+def generate(html, **options):
+    """Convert an HTML string to Telegram markdown.
+
+    Keyword options are forwarded to TelegramMarkdownConverter; ``convert`` and
+    ``bullets`` get defaults here but may be overridden by the caller.
+    """
+    # setdefault (not fixed keywords) so a caller-supplied value cannot collide.
+    options.setdefault('convert', ['br', 'p', 'img', 'code', 'pre', 'ul', 'ol', 'li', 'a', 'sup', 'sub'])
+    options.setdefault('bullets', '•••')
+    converted = TelegramMarkdownConverter(**options).convert(html).strip()
+    # Collapse runs of newlines so blocks are separated by at most one blank line.
+    return re.sub('\n{2,}', '\n\n', converted)