Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
Rairye authored Oct 29, 2021
1 parent b9c31cb commit 54946b2
Showing 1 changed file with 65 additions and 18 deletions.
83 changes: 65 additions & 18 deletions mnl_punct_norm/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,40 +23,87 @@ def is_punct(char):

return ((char.isalpha() or char.isnumeric()) or char.isspace()) == False

def strip_outer_punct(word):
    """Return *word* with leading and trailing punctuation removed.

    Characters are classified with ``is_punct``; interior punctuation is
    left untouched. Anything that is not exactly a ``str`` is handed back
    unchanged. A word made up solely of punctuation collapses to ``""``.
    """
    if type(word) is not str:
        return word

    start = 0
    end = len(word)

    # Advance past the leading run of punctuation.
    while start < end and is_punct(word[start]):
        start += 1

    # Retreat past the trailing run, never crossing the left cursor.
    while end > start and is_punct(word[end - 1]):
        end -= 1

    return word[start:end]


def get_category(char):
    """Classify *char* as ``"SPACE"``, ``"PUNCT"`` or ``"NOTPUNCT"``.

    Whitespace is checked first; every other character is labelled
    according to what ``is_punct`` reports for it.
    """
    if not char.isspace():
        if is_punct(char):
            return "PUNCT"
        return "NOTPUNCT"

    return "SPACE"

def normalize_punct(input_str, mode, input_skips = "", replacement = " "):
# Walks input_str character by character and handles every character that
# get_category labels "PUNCT" and that is not listed in input_skips.
# When mode == "REPLACE" the replacement string is emitted for punctuation;
# the non-REPLACE path appears to drop the punctuation instead.
#
# NOTE(review): this text comes from a diff view with the +/- markers and all
# indentation stripped, so lines of the pre-change and post-change
# implementations appear interleaved (two initialisations of `result`, two
# loop headers, string `+=` next to list `append`). The comments below point
# out which lines seem to belong together -- TODO confirm against the
# repository before relying on any of it.
#
# `result` is initialised twice: as a str (used by the `+=` / `endswith`
# lines) and as a list (used by the `append` / `"".join` lines).
result = ""
result = []
# Characters exempt from normalisation; a non-str input_skips disables skipping.
skips = [] if type(input_skips) != str else set([char for char in input_skips])
# A non-str replacement falls back to a single space.
replacement = replacement if type(replacement) == str else " "
# last_space is only referenced by the str-based lines further down.
last_space = None
# Index of the most recent punctuation character that was handled, or None.
last_replacement = None
last_char = ""
last_category = ""
string_len = len(input_str)

# Loop header of the list-based variant (uses current_char / string_len).
for i in range(string_len):
current_char = input_str[i]
current_category = get_category(current_char)

if current_category == "PUNCT" and current_char not in skips:
if mode == "REPLACE":
# Flush the untouched text since the previous handled index; the replacement
# is added only for the first hit, or when the previous character was neither
# the replacement itself nor whitespace.
if (last_char != replacement and last_category != "SPACE") or last_replacement == None:
result.append((input_str[0 if last_replacement == None else last_replacement + 1: i] + replacement))

else:
result.append(input_str[0 if last_replacement == None else last_replacement + 1: i])

# Loop header of the str-based variant (uses char / len(input_str)).
for i in range(len(input_str)):
char = input_str[i]
current_category = get_category(char)

if current_category == "PUNCT" and char not in skips:
# Emits the replacement unless the output already ends with it or the
# previous position was a space or an earlier replacement.
if mode == "REPLACE" and ((last_space == None and last_replacement == None) or ((last_space != i-1 and input_str[i-1] != replacement) and (last_replacement != i-1 and not result.endswith(replacement)))):
result+=replacement
last_replacement = i
last_space = i
last_char = replacement
last_category = "REPLACEMENT"
continue

else:
# Presumably the non-REPLACE path of the list-based variant: flush the text
# between handled punctuation without inserting anything -- TODO confirm
# which `if` this `else` pairs with.
if last_replacement == None or (last_replacement != None and i - last_replacement > 1):
result.append(input_str[0 if last_replacement == None else last_replacement+1:i])
last_replacement = i

continue

if current_category == "SPACE" and mode == "REPLACE":
last_space = i

# Skips a character that would duplicate the one just written immediately
# after a replacement (str-based variant).
if result.endswith(char) and (last_replacement == i-1):
continue
# NOTE(review): the nesting of this check is not recoverable from here; it
# moves last_replacement forward to the current index -- verify intent.
if last_category == "REPLACEMENT":
last_replacement = i

result+=char
last_char = current_char
last_category = current_category

# No punctuation index was recorded, so the input is returned unchanged.
if last_replacement == None:
return input_str

# `return result` looks like the final line of the str-based variant; the
# statements after it belong to the list-based variant.
return result
# Append whatever follows the last handled punctuation, then join the pieces.
if last_replacement < string_len - 1:
result.append(input_str[last_replacement+1:])

return "".join(result)

def strip_punct(input_str, input_skips = ""):
if type(input_str) != str:
Expand Down

0 comments on commit 54946b2

Please sign in to comment.