diff --git a/code/markdown_to_json_parser.py b/code/markdown_to_json_parser.py index f782408..20671a3 100644 --- a/code/markdown_to_json_parser.py +++ b/code/markdown_to_json_parser.py @@ -346,7 +346,30 @@ def parse_paper_links(html): def extract_paper_data(paper_section, columns): title_column = columns[0] # title = title_column.get_text(strip=True) - title = title_column.a.encode_contents().decode("utf-8") + title = ( + title_column.a.encode_contents().decode("utf-8") + if title_column.a is not None + else ( + title_column.encode_contents().decode("utf-8") + if title_column.get_text(strip=True) is not None + else None + ) + ) + + title = re.sub(r"<(?:br\s*/?>|img[^>]*>)", "", title) + title = title.strip() + + html_entities = { + "&": "&", + "<": "<", + ">": ">", + """: '"', + "'": "'", + } + title = re.sub( + r"(&\w+;)", lambda x: html_entities.get(x.group(0), x.group(0)), title + ) + title_link = title_column.find("a") title_page = title_link["href"] if title_link else None