forked from PromtEngineer/localGPT
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxlxs_loader.py
102 lines (89 loc) · 3.83 KB
/
xlxs_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import csv
import tempfile
from typing import Dict, List, Optional
import openpyxl
from langchain.docstore.document import Document
from langchain.document_loaders import CSVLoader
from langchain.document_loaders.base import BaseLoader
# https://learn.microsoft.com/en-us/deployoffice/compat/office-file-format-reference
def xlsx_to_csv(file_path: str, sheet_name: str = None) -> list[str]:
"""
Convert a workbook into a list of csv files
:param file_path: the path to the workbook
:param sheet_name: the name of the sheet to convert
:return: a list of temporary file names
"""
# Load the workbook and select the active worksheet
wb = openpyxl.load_workbook(file_path)
# ws = wb.active
#
# # Create a new temporary file and write the contents of the worksheet to it
# with tempfile.NamedTemporaryFile(mode='w+', newline='', delete=False) as f:
# c = csv.writer(f)
# for r in ws.rows:
# c.writerow([cell.value for cell in r])
# return f.name
# load all sheets if sheet_name is None
wb = wb if sheet_name is None else [wb[sheet_name]]
temp_file_name = []
# Iterate over the worksheets in the workbook
for ws in wb:
# Create a new temporary file and write the contents of the worksheet to it
with tempfile.NamedTemporaryFile(mode='w+', newline='', suffix='.csv', delete=False) as f:
c = csv.writer(f)
for r in ws.rows:
c.writerow([cell.value for cell in r])
temp_file_name.append(f.name)
# print(f'all Sheets are saved to temporary file {temp_file_name}')
return temp_file_name
class XLSXLoader(BaseLoader):
"""Loads an XLSX file into a list of documents.
Each document represents one row of the CSV file converted from the XLSX file.
Every row is converted into a key/value pair and outputted to a new line in the
document's page_content.
The source for each document loaded from csv is set to the value of the
`file_path` argument for all doucments by default.
You can override this by setting the `source_column` argument to the
name of a column in the CSV file.
The source of each document will then be set to the value of the column
with the name specified in `source_column`.
Output Example:
.. code-block:: txt
column1: value1
column2: value2
column3: value3
"""
def __init__(
self,
file_path: str,
source_column: Optional[str] = None,
csv_args: Optional[Dict] = None,
encoding: Optional[str] = None,
):
self.file_path = file_path
self.source_column = source_column
self.encoding = encoding
self.csv_args = csv_args or {}
def load(self) -> List[Document]:
"""Load data into document objects."""
docs = []
csv_files = xlsx_to_csv(self.file_path)
for csv_file in csv_files:
with open(csv_file, newline="", encoding=self.encoding) as csvfile:
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
for i, row in enumerate(csv_reader):
content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
try:
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
)
except KeyError:
raise ValueError(
f"Source column '{self.source_column}' not found in CSV file."
)
metadata = {"source": source, "row": i}
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
return docs