Skip to content

Commit

Permalink
Use Pandoc to render markdown, fallback to plaintext
Browse files Browse the repository at this point in the history
Also used for markdown rendering in console
and copy-to-clipboard
  • Loading branch information
danschwarz committed Nov 26, 2023
1 parent 7443d3e commit 8cb294f
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 247 deletions.
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,15 @@
"wcwidth>=0.1.7",
"urwid>=2.0.0,<3.0",
"tomlkit>=0.10.0,<1.0",
"html2text>=2020.1.16"
],
extras_require={
# Required to display rich text in the TUI
"richtext": [
"urwidgets>=0.1,<0.2"
"urwidgets>=0.1,<0.2",
],
"markdown": [
"pypandoc>=1.12.0,<2.0",
"pypandoc-binary>=1.12.0,<2.0",
],
"dev": [
"coverage",
Expand Down
204 changes: 0 additions & 204 deletions tests/test_console.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,210 +152,6 @@ def test_timeline(mock_get, monkeypatch, capsys):
assert err == ""


@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
    """Check how the `timeline` command prints a status with rich HTML content.

    The patched `toot.http.get` returns a single status whose content uses
    headings, inline emphasis, a code block, blockquotes, nested lists,
    hashtags and a link. The test runs `timeline --once`, verifies the request
    that was issued, and compares captured stdout line by line with the
    expected rendering. Nothing may be written to stderr.
    """
    html_body = "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>"
    status = {
        'id': '111111111111111111',
        'account': {
            'display_name': 'Frank Zappa 🎸',
            'acct': 'fz'
        },
        'created_at': '2017-04-12T15:53:18.174Z',
        'content': html_body,
        'reblog': None,
        'in_reply_to_id': None,
        'media_attachments': [],
    }
    mock_get.return_value = MockResponse([status])

    console.run_command(app, user, 'timeline', ['--once'])

    # Exactly one request to the home timeline endpoint is expected.
    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})

    captured = capsys.readouterr()
    actual = captured.out.split("\n")
    expected = [
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
        "",
        "## HTML Render Test",
        "",
        " _emphasized_ ",
        " _underlined_ ",
        " **bold** ",
        " ** _bold and italic_** ",
        " ~~strikethrough~~ ",
        "regular text",
        "",
        "Code block:",
        "",
        " ",
        " 10 PRINT \"HELLO WORLD\" ",
        " 20 GOTO 10 ",
        " ",
        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
        " 1. List item",
        " • Nested item",
        " • Another nested ",
        " 2. Another list item. ",
        " 1. Something else nested",
        " 2. And a last nested",
        "",
        "> Blockquote",
        "> 1. List in BQ",
        "> 2. List item 2 in BQ",
        ">",
        "",
        "#hashtag #test ",
        "https://a.com text after link",
        "",
        "ID 111111111111111111 ",
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "",
    ]

    # Compare the line counts first so the pairwise loop below cannot
    # silently ignore missing or extra output lines.
    assert len(actual) == len(expected)
    for index, (got, want) in enumerate(zip(actual, expected)):
        assert got == want, f"Line #{index}: Expected:\n{want}\nGot:\n{got}"

    assert captured.err == ""


@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
    """Check how the `timeline` command prints a status with rich HTML content.

    The patched `toot.http.get` returns a single status whose content uses
    headings, inline emphasis, a code block, blockquotes, nested lists,
    hashtags and a link. The test runs `timeline --once`, verifies the request
    that was issued, and compares captured stdout line by line with the
    expected rendering. Nothing may be written to stderr.
    """
    html_body = "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>"
    status = {
        'id': '111111111111111111',
        'account': {
            'display_name': 'Frank Zappa 🎸',
            'acct': 'fz'
        },
        'created_at': '2017-04-12T15:53:18.174Z',
        'content': html_body,
        'reblog': None,
        'in_reply_to_id': None,
        'media_attachments': [],
    }
    mock_get.return_value = MockResponse([status])

    console.run_command(app, user, 'timeline', ['--once'])

    # Exactly one request to the home timeline endpoint is expected.
    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})

    captured = capsys.readouterr()
    actual = captured.out.split("\n")
    expected = [
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
        "",
        "## HTML Render Test",
        "",
        " _emphasized_ ",
        " _underlined_ ",
        " **bold** ",
        " ** _bold and italic_** ",
        " ~~strikethrough~~ ",
        "regular text",
        "",
        "Code block:",
        "",
        " ",
        " 10 PRINT \"HELLO WORLD\" ",
        " 20 GOTO 10 ",
        " ",
        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
        " 1. List item",
        " • Nested item",
        " • Another nested ",
        " 2. Another list item. ",
        " 1. Something else nested",
        " 2. And a last nested",
        "",
        "> Blockquote",
        "> 1. List in BQ",
        "> 2. List item 2 in BQ",
        ">",
        "",
        "#hashtag #test ",
        "https://a.com text after link",
        "",
        "ID 111111111111111111 ",
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "",
    ]

    # Compare the line counts first so the pairwise loop below cannot
    # silently ignore missing or extra output lines.
    assert len(actual) == len(expected)
    for index, (got, want) in enumerate(zip(actual, expected)):
        assert got == want, f"Line #{index}: Expected:\n{want}\nGot:\n{got}"

    assert captured.err == ""


@mock.patch('toot.http.get')
def test_timeline_html_content(mock_get, monkeypatch, capsys):
    """Check how the `timeline` command prints a status with rich HTML content.

    The patched `toot.http.get` returns a single status whose content uses
    headings, inline emphasis, a code block, blockquotes, nested lists,
    hashtags and a link. The test runs `timeline --once`, verifies the request
    that was issued, and compares captured stdout line by line with the
    expected rendering. Nothing may be written to stderr.
    """
    html_body = "<h2>HTML Render Test</h2><p><em>emphasized</em><br><u>underlined</u><br><strong>bold</strong><br><strong><em>bold and italic</em></strong><br><del>strikethrough</del><br>regular text</p><p>Code block:</p><pre><code>10 PRINT \"HELLO WORLD\"<br>20 GOTO 10<br></code></pre><blockquote><p>Something blockquoted here. The indentation is maintained as the text line wraps.</p></blockquote><ol><li>List item<ul><li>Nested item</li><li>Another nested </li></ul></li><li>Another list item. <ol><li>Something else nested</li><li>And a last nested</li></ol></li></ol><blockquote><p>Blockquote</p><ol><li>List in BQ</li><li>List item 2 in BQ</li></ol></blockquote><p><a href=\"https://babka.social/tags/hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>hashtag</span></a> <a href=\"https://babka.social/tags/test\" class=\"mention hashtag\" rel=\"tag\">#<span>test</span></a> <br><a href=\"https://a.com\" target=\"_blank\" rel=\"nofollow noopener noreferrer\"><span class=\"invisible\">https://</span><span class=\"\">a.com</span><span class=\"invisible\"></span></a> text after link</p>"
    status = {
        'id': '111111111111111111',
        'account': {
            'display_name': 'Frank Zappa 🎸',
            'acct': 'fz'
        },
        'created_at': '2017-04-12T15:53:18.174Z',
        'content': html_body,
        'reblog': None,
        'in_reply_to_id': None,
        'media_attachments': [],
    }
    mock_get.return_value = MockResponse([status])

    console.run_command(app, user, 'timeline', ['--once'])

    # Exactly one request to the home timeline endpoint is expected.
    mock_get.assert_called_once_with(app, user, '/api/v1/timelines/home', {'limit': 10})

    captured = capsys.readouterr()
    actual = captured.out.split("\n")
    expected = [
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "Frank Zappa 🎸 @fz 2017-04-12 15:53 UTC",
        "",
        "## HTML Render Test",
        "",
        " _emphasized_ ",
        " _underlined_ ",
        " **bold** ",
        " ** _bold and italic_** ",
        " ~~strikethrough~~ ",
        "regular text",
        "",
        "Code block:",
        "",
        " ",
        " 10 PRINT \"HELLO WORLD\" ",
        " 20 GOTO 10 ",
        " ",
        "> Something blockquoted here. The indentation is maintained as the text line wraps.",
        " 1. List item",
        " • Nested item",
        " • Another nested ",
        " 2. Another list item. ",
        " 1. Something else nested",
        " 2. And a last nested",
        "",
        "> Blockquote",
        "> 1. List in BQ",
        "> 2. List item 2 in BQ",
        ">",
        "",
        "#hashtag #test ",
        "https://a.com text after link",
        "",
        "ID 111111111111111111 ",
        "────────────────────────────────────────────────────────────────────────────────────────────────────",
        "",
    ]

    # Compare the line counts first so the pairwise loop below cannot
    # silently ignore missing or extra output lines.
    assert len(actual) == len(expected)
    for index, (got, want) in enumerate(zip(actual, expected)):
        assert got == want, f"Line #{index}: Expected:\n{want}\nGot:\n{got}"

    assert captured.err == ""


@mock.patch('toot.http.get')
def test_timeline_with_re(mock_get, monkeypatch, capsys):
mock_get.return_value = MockResponse([{
Expand Down
17 changes: 3 additions & 14 deletions toot/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
import re
import sys
import textwrap
import html2text

from functools import lru_cache
from toot import settings
from toot.utils import get_text
from toot.richtext import html_to_text
from toot.entities import Account, Instance, Notification, Poll, Status
from toot.wcstring import wc_wrap
from typing import List
Expand Down Expand Up @@ -321,20 +321,9 @@ def print_status(status: Status, width: int = 80):


def print_html(text, width=80):
h2t = html2text.HTML2Text()

h2t.body_width = width
h2t.single_line_break = True
h2t.ignore_links = True
h2t.wrap_links = True
h2t.wrap_list_items = True
h2t.wrap_tables = True
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"
markdown = h2t.handle(text).strip()

markdown = "\n".join(html_to_text(text, columns=width, highlight_tags=False))
print_out("")
print_out(highlight_hashtags(markdown))
print_out(markdown)


def print_poll(poll: Poll):
Expand Down
25 changes: 25 additions & 0 deletions toot/richtext/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from toot.tui.utils import highlight_hashtags
from toot.utils import html_to_paragraphs
from toot.wcstring import wc_wrap
from typing import List

try:
    # Preferred renderer: the pypandoc-backed implementation.
    from .markdown import html_to_text

except ImportError:
    # The markdown renderer could not be imported; fall back to plain text.
    def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
        """Render HTML as a list of plain-text lines wrapped to `columns`.

        Paragraphs come from `html_to_paragraphs`; each of their lines is
        wrapped with `wc_wrap`. An empty string is inserted between
        consecutive paragraphs. When `highlight_tags` is true, every wrapped
        line is passed through `highlight_hashtags` before being collected.
        """
        rendered = []
        for index, paragraph in enumerate(html_to_paragraphs(html)):
            # Separate paragraphs with a blank line (none before the first).
            if index > 0:
                rendered.append("")
            for line in paragraph:
                for piece in wc_wrap(line, columns):
                    rendered.append(highlight_hashtags(piece) if highlight_tags else piece)
        return rendered
11 changes: 11 additions & 0 deletions toot/richtext/markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pypandoc import convert_text
from typing import List


def html_to_text(html: str, columns=80, highlight_tags=False) -> List:
    """Convert an HTML fragment to markdown text using pandoc.

    The HTML is converted with `pypandoc.convert_text` to the
    `gfm-raw_html` output format, with wrapping enabled at `columns`
    characters.

    Args:
        html: HTML source to convert.
        columns: Column width handed to pandoc's `--columns` option.
        highlight_tags: Accepted but not used by this renderer.

    Returns:
        The rendered markdown as a list of lines, without the newline that
        terminates pandoc's output. The list is empty when pandoc produces
        no text.
    """
    rendered = convert_text(
        html,
        format="html",
        to="gfm-raw_html",
        extra_args=["--wrap=auto", f"--columns={columns}"],
    )
    # pandoc hands back one multi-line string ending in a newline. Callers
    # combine the result with "\n".join(...), so return individual lines and
    # drop the terminator; otherwise a stray blank line ends up in the
    # printed / copied text.
    body = rendered.rstrip("\n")
    return body.split("\n") if body else []
12 changes: 4 additions & 8 deletions toot/tui/app.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import logging
import subprocess
import urwid
import html2text

from concurrent.futures import ThreadPoolExecutor

from toot import api, config, __version__, settings
from toot.console import get_default_visibility
from toot.exceptions import ApiError
from toot.richtext import html_to_text
from toot.utils.datetime import parse_datetime

from .compose import StatusComposer
Expand Down Expand Up @@ -656,12 +656,8 @@ def _done(loop):
return self.run_in_thread(_delete, done_callback=_done)

def copy_status(self, status):
h2t = html2text.HTML2Text()
h2t.body_width = 0 # nowrap
h2t.single_line_break = True
h2t.ignore_links = True
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"

markdown = "\n".join(html_to_text(status.original.data["content"], columns=1024, highlight_tags=False))

time = parse_datetime(status.original.data['created_at'])
time = time.strftime('%Y-%m-%d %H:%M %Z')
Expand All @@ -671,7 +667,7 @@ def copy_status(self, status):
+ "\n"
+ (status.original.author.account or "")
+ "\n\n"
+ h2t.handle(status.original.data["content"]).strip()
+ markdown
+ "\n\n"
+ f"Created at: {time}")

Expand Down
35 changes: 16 additions & 19 deletions toot/tui/richtext/__init__.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,24 @@
import urwid
import html2text

from toot.tui.utils import highlight_hashtags
from toot.utils import format_content
from typing import List

try:
# our first preference is to render using urwidgets
from .richtext import html_to_widgets, url_to_widget

except ImportError:
# Fallback if urwidgets are not available
def html_to_widgets(html: str) -> List[urwid.Widget]:
return [
urwid.Text(_format_markdown(html))
]
try:
# second preference, render markup with pypandoc
from .markdown import html_to_widgets, url_to_widget

except ImportError:
# Fallback to render in plaintext

def url_to_widget(url: str):
return urwid.Text(("link", url))
def url_to_widget(url: str):
return urwid.Text(("link", url))

def _format_markdown(html) -> str:
h2t = html2text.HTML2Text()
h2t.single_line_break = True
h2t.ignore_links = True
h2t.wrap_links = False
h2t.wrap_list_items = False
h2t.wrap_tables = False
h2t.unicode_snob = True
h2t.ul_item_mark = "\N{bullet}"
return h2t.handle(html).strip()
def html_to_widgets(html: str) -> List[urwid.Widget]:
return [
urwid.Text(highlight_hashtags(line)) for line in format_content(html)
]
Loading

0 comments on commit 8cb294f

Please sign in to comment.