# parser.py
# Flask service that redacts names, phone numbers, or street addresses from
# uploaded .pdf / .docx documents and returns the redacted file.
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
from google.cloud import dialogflow_v2 as dialogflow
import json
import logging
import warnings
import os
import io
# Import additional packages
from transformers import pipeline
from PyPDF2 import PdfReader
from docx import Document
import re
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# TensorFlow environment flags: log level '3' and oneDNN opts '0'.
# NOTE(review): these are assigned after `transformers` has been imported
# above; confirm they are still set early enough to have the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
# Suppress FutureWarning / DeprecationWarning output and restrict the
# "transformers" logger to errors only.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)
# Flask application object; CORS(app) applies flask_cors with its defaults.
app = Flask(__name__)
CORS(app)
# Credentials file name is relative, so it is resolved against the process's
# working directory. Presumably read by SessionsClient() below, which is why
# it is set first -- keep this ordering.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "redact-names-icmq-29edfc30b1f7.json"
# NOTE(review): `client` and `project_id` are not referenced by any other code
# visible in this file; confirm they are still needed.
client = dialogflow.SessionsClient()
project_id = 'redact-names-icmq'
# Named-entity-recognition pipeline, built once at import time and shared by
# every request (used by extractInfo).
nlp = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", tokenizer="dbmdz/bert-large-cased-finetuned-conll03-english")
def pdfxtract(file_path):
    """Return the text of the PDF at *file_path*, one page per line group.

    Each page's extracted text is collected in page order and the pages are
    joined with a single newline.
    """
    pdf = PdfReader(file_path)
    page_texts = []
    for pdf_page in pdf.pages:
        page_texts.append(pdf_page.extract_text())
    return "\n".join(page_texts)
def docxtract(file_path):
    """Return the text of the .docx file at *file_path*.

    Only the document's paragraphs are read; their text is joined with a
    single newline, in document order.
    """
    word_document = Document(file_path)
    return "\n".join(paragraph.text for paragraph in word_document.paragraphs)
def extract_text(file_path):
    """Return the plain text of a supported document.

    The format is selected from the file extension, compared
    case-insensitively so that ``REPORT.PDF`` is handled like ``report.pdf``.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file.

    Returns:
        The extracted text, or the literal string
        ``'Unsupported file format'`` when the extension is neither ``.pdf``
        nor ``.docx`` (including when there is no extension at all).
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        return pdfxtract(file_path)
    if ext == '.docx':
        return docxtract(file_path)
    # Unsupported input is reported with this sentinel string, not an
    # exception, so existing callers keep working.
    return 'Unsupported file format'
# Patterns are compiled once at import time instead of on every call.
_PHONE_PATTERN = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
# The street-type alternation is a NON-capturing group: with a capturing group
# re.findall() returns only the group ("Street", "Rd", ...) rather than the
# whole matched address.
_ADDRESS_PATTERN = re.compile(
    r'\d{1,5} \w+ (?:Street|St|Avenue|Ave|Boulevard|Blvd|Road|Rd|Lane|Ln|Block|Sector)\b'
)
_PERSON_TAGS = ('B-PER', 'I-PER')


def extractInfo(text):
    """Find person names, phone numbers, and street addresses in *text*.

    Names come from the module-level ``nlp`` pipeline: every entity whose
    ``'entity'`` tag is ``B-PER`` or ``I-PER`` contributes its ``'word'``.
    A word starting with ``"##"`` is treated as a continuation piece and is
    glued (without the prefix) onto the name being assembled; any other
    person word starts a new name. Phone numbers and addresses are found
    with regular expressions.

    Args:
        text: The document text to scan.

    Returns:
        A dict with the keys ``'names'``, ``'phones'`` and ``'addresses'``,
        each mapping to a list of the matched strings in order of appearance.
    """
    names = []
    pieces = []  # word pieces of the name currently being assembled
    for entity in nlp(text):
        word = entity['word']
        is_person = entity['entity'] in _PERSON_TAGS
        if is_person and word.startswith("##"):
            pieces.append(word[2:])
            continue
        # Any other token ends the name in progress (if there is one).
        if pieces:
            names.append("".join(pieces))
            pieces = []
        if is_person:
            pieces.append(word)
    if pieces:
        names.append("".join(pieces))

    return {
        'names': names,
        'phones': _PHONE_PATTERN.findall(text),
        'addresses': _ADDRESS_PATTERN.findall(text),
    }
def redact(text, info):
    """Replace every listed name, phone number, and address with ``[REDACTED]``.

    All targets are replaced in a single pass, trying longer targets first.
    Compared with replacing one target after another this:

    * keeps a short target from breaking a longer one that contains it
      (``"Ann"`` vs ``"Anna"``),
    * never rewrites text inside a ``[REDACTED]`` marker that was just
      inserted, and
    * ignores empty targets, which would otherwise insert the marker between
      every character.

    Args:
        text: The text to redact.
        info: Mapping with optional ``'names'``, ``'phones'`` and
            ``'addresses'`` keys, each an iterable of literal strings.

    Returns:
        The redacted text; *text* unchanged when there is nothing to redact.
    """
    targets = {
        value
        for key in ('names', 'phones', 'addresses')
        for value in info.get(key, ())
        if value
    }
    if not targets:
        return text
    # Longest first so the alternation prefers "Anna" over its prefix "Ann".
    longest_first = sorted(targets, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(target) for target in longest_first))
    return pattern.sub('[REDACTED]', text)
def redactedDocx(text, output_path):
    """Write *text* to a new .docx file at *output_path*.

    The text is split on newline characters and each piece becomes its own
    paragraph in a freshly created document.
    """
    out_document = Document()
    paragraphs = text.split('\n')
    for paragraph_text in paragraphs:
        out_document.add_paragraph(paragraph_text)
    out_document.save(output_path)
def redactedPdf(text, output_path):
    """Write *text* to a new letter-size PDF at *output_path*.

    The text is split on newline characters and written one line at a time
    starting at (40, 750). When the text cursor drops below the bottom margin
    the current page is finished and writing continues at the top of a new
    page, so long documents are no longer cut off after the first page.

    Args:
        text: The text to render.
        output_path: Destination path of the PDF file.
    """
    left_margin = 40
    top_y = 750       # starting baseline on every page
    bottom_margin = 40

    c = canvas.Canvas(output_path, pagesize=letter)
    text_object = c.beginText(left_margin, top_y)
    for line in text.split('\n'):
        if text_object.getY() < bottom_margin:
            # Page is full: flush what we have and start a fresh page.
            c.drawText(text_object)
            c.showPage()
            text_object = c.beginText(left_margin, top_y)
        text_object.textLine(line)
    c.drawText(text_object)
    c.save()
@app.route('/upload', methods=['POST'])
def upload_file():
    """Redact an uploaded .pdf/.docx file and return the redacted document.

    Request (multipart form):
        file: The document to redact.
        entity-type: ``'names'`` (default), ``'phones'`` or ``'addresses'``;
            any other value redacts all three kinds.

    Returns:
        The redacted file as an attachment named ``redacted_output.<ext>``,
        or a JSON object ``{'message': ...}`` describing why nothing was
        produced.

    The upload and the generated output live in a per-request temporary
    directory that is removed before the response is sent, so concurrent
    requests cannot overwrite each other's files and the un-redacted upload
    is not left on disk.
    """
    import tempfile  # local import keeps this handler self-contained

    if 'file' not in request.files:
        return jsonify({'message': 'No file part'})
    file = request.files['file']
    # Covers both an empty filename and a missing (None) one.
    if not file.filename:
        return jsonify({'message': 'No selected file'})

    # Normalise the extension so ".PDF"/".DOCX" are accepted, and reject
    # unsupported formats before doing any expensive work.
    ext = os.path.splitext(file.filename)[1].lower()
    mimetypes_by_ext = {
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.pdf': 'application/pdf',
    }
    if ext not in mimetypes_by_ext:
        return jsonify({'message': 'Unsupported file format'})

    entity_to_redact = request.form.get('entity-type', 'names')
    entity_kinds = ('names', 'phones', 'addresses')
    output_name = "redacted_output" + ext
    try:
        with tempfile.TemporaryDirectory() as workdir:
            file_path = os.path.join(workdir, "upload" + ext)
            file.save(file_path)

            text = extract_text(file_path)
            info = extractInfo(text)
            if entity_to_redact in entity_kinds:
                # Keep only the requested kind; unknown values redact all.
                info = {
                    kind: (info[kind] if kind == entity_to_redact else [])
                    for kind in entity_kinds
                }
            redacted = redact(text, info)

            output_path = os.path.join(workdir, output_name)
            if ext == '.docx':
                redactedDocx(redacted, output_path)
            else:
                redactedPdf(redacted, output_path)

            # Load the result into memory so the temporary directory can be
            # deleted before the response is streamed.
            with open(output_path, 'rb') as output_file:
                payload = io.BytesIO(output_file.read())

        response = send_file(payload, mimetype=mimetypes_by_ext[ext])
        response.headers['Content-Disposition'] = f'attachment; filename={output_name}'
        return response
    except Exception as e:
        # Top-level request boundary: log the traceback, report the message.
        app.logger.exception("Redaction failed")
        return jsonify({'message': f"Error: {str(e)}"})
if __name__ == '__main__':
    # Debug mode turns on the interactive debugger and reloader, which must
    # not be exposed outside development. It stays on by default (as before)
    # but can be switched off without a code change, e.g. FLASK_DEBUG=0.
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').strip().lower() not in (
        '0', 'false', 'no', 'off',
    )
    app.run(debug=debug_enabled)