-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(api): add markdown support (#106)
* feat(api): add markdown support * fix: unit test markdown upload --------- Co-authored-by: Julien Bouquillon <[email protected]> Co-authored-by: leoguillaume <[email protected]>
- Loading branch information
1 parent
4f99763
commit c40cf9c
Showing
10 changed files
with
144 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,8 @@ | ||
from ._htmlparser import HTMLParser | ||
from ._jsonparser import JSONParser | ||
from ._pdfparser import PDFParser | ||
from ._mdparser import MarkdownParser | ||
from ._baseparser import BaseParser | ||
|
||
|
||
__all__ = ["HTMLParser", "JSONParser", "PDFParser", "BaseParser"] | ||
__all__ = ["HTMLParser", "JSONParser", "PDFParser", "MarkdownParser", "BaseParser"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import re | ||
import time | ||
from typing import List, Optional, Tuple | ||
import uuid | ||
|
||
from bs4 import BeautifulSoup | ||
from fastapi import UploadFile | ||
|
||
from app.schemas.data import ParserOutput, ParserOutputMetadata | ||
|
||
from . import HTMLParser | ||
from ._baseparser import BaseParser | ||
|
||
|
||
class MarkdownParser(BaseParser):
    """Parser that converts an uploaded Markdown file into a single ParserOutput."""

    # A header line is one or more '#' characters followed by whitespace.
    # Compiled once here instead of on every line of the document.
    _HEADER_PATTERN = re.compile(r"^#+\s")

    def __init__(self, *args, **kwargs) -> None:
        """Forward all arguments unchanged to the BaseParser constructor."""
        super().__init__(*args, **kwargs)

    def parse(self, file: UploadFile) -> List[ParserOutput]:
        """
        Parse a Markdown file and convert it into a list of chunk objects.

        The text is split into (header, body) sections at every header line
        found outside a fenced code block. Sections are rendered as
        "<header>:\\n<body>", joined with newlines, passed through
        ``self.clean`` and returned as one ParserOutput.

        Args:
            file (UploadFile): Markdown file to parse. Its content is decoded as UTF-8.
        Returns:
            List[ParserOutput]: List containing a single parsed output whose
                metadata title is the first header line (including its
                leading '#' markers), or None when the file has no header.
        """

        markdown_text = file.file.read().decode(encoding="utf-8")

        markdown_tups: List[Tuple[Optional[str], str]] = []

        title: Optional[str] = None
        current_header: Optional[str] = None
        current_lines: List[str] = []
        in_code_block = False

        for line in markdown_text.split("\n"):
            if line.startswith("```"):
                # This is the end of a code block if we are already in it, and vice versa.
                in_code_block = not in_code_block

            # Lines starting with '#' inside a fenced code block are content, not headers.
            if not in_code_block and self._HEADER_PATTERN.match(line):
                # Upon first header, skip if current text chunk is empty
                if current_header is not None or current_lines:
                    markdown_tups.append((current_header, "\n".join(current_lines)))
                if not title:
                    title = line
                current_header = line
                current_lines = []
            else:
                current_lines.append(line)

        # Append final text chunk. Same condition as inside the loop, so a
        # header on the very last line (with no body after it) is not dropped.
        if current_header is not None or current_lines:
            markdown_tups.append((current_header, "\n".join(current_lines)))

        # Text that precedes the first header has no header (None): emit the
        # body alone rather than a literal "None:" prefix.
        extracted_text = [body if header is None else f"{header}:\n{body}" for (header, body) in markdown_tups]

        # NOTE(review): `clean` and `collection_id` are not defined in this file;
        # presumably provided by BaseParser - confirm.
        content = self.clean("\n".join(extracted_text).strip())

        name = file.filename.strip()

        metadata = ParserOutputMetadata(
            collection_id=self.collection_id, document_id=str(uuid.uuid4()), document_name=name, document_created_at=round(time.time()), title=title
        )

        output = [ParserOutput(content=content, metadata=metadata)]

        return output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Dauphinois de potimarron - recette végétarienne cuisine-libre.org | ||
|
||
## Description | ||
|
||
Préparation : 20 min Cuisson : 1 h [Four](https://www.cuisine-libre.org/four) | ||
|
||
![Végétarien](https://www.cuisine-libre.org/local/cache-vignettes/L40xH40/moton18-9d595.png?1644794211 "Végétarien") | ||
|
||
Sans viande Sans œuf | ||
|
||
Dauphinois de potimarron Rated 5.00 out of 5 based on 2 ratings. | ||
|
||
![](https://www.cuisine-libre.org/local/cache-gd2/6d/60311ebc0c8cb1dfbbe3e5cf92e9fd.jpg?1675005547) | ||
|
||
![Appétissante photo DR](https://www.cuisine-libre.org/local/cache-gd2/c6/f3d3dd24ed5a690a2e6ad481f8a95c.jpg?1675005547) | ||
|
||
## Ingrédients pour 4 | ||
|
||
- potimarron de 1 kg (ou plus) | ||
- crème fraiche liquide (fleurette) | ||
- ail | ||
- beurre | ||
- sel, poivre | ||
|
||
## Préparation | ||
|
||
Préchauffer le four à 180/200°C. | ||
|
||
Couper le potimarron en « taillons » de quelques millimètres d’épaisseur. | ||
Frotter d’une gousse d’ail épluchée un plat à four en terre. Y répartir les « taillons » en couches, saler et poivrer entre chaque couche. Verser la crème, qui doit juste couvrir le potimarron (jusqu’à un litre en fonction de la taille du plat). Parsemer de quelques noisettes de beurre, pour le gratiné final. | ||
|
||
Cuire une heure environ, forcer à 220°C les dix dernières minutes. | ||
|
||
## <:info_post_scriptum:> | ||
|
||
Un plat qui se réchauffe plus facilement que le vrai [gratin dauphinois](https://www.cuisine-libre.org/gratin-dauphinois) (aux pommes de terre). Cette recette m’a été proposée par ma productrice de légumes préférée : c’est la recette du gratin dauphinois appliquée à la courge. Elle convient au potimarron et aux variétés de citrouilles fermes. | ||
|
||
- [![](https://www.cuisine-libre.org/local/cache-gd2/13/f50d96b2f12916e2df6b65f1bd381c.jpg?1644794690)Potimarron](https://www.cuisine-libre.org/potimarron) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters