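"""Compare a job description against a resume by matching spaCy noun tokens.

Loads the two plain-text files below, extracts nouns (and noun phrases) with
the en_core_web_trf pipeline, and reports which job-description nouns are
missing from the resume.
"""
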
import io

import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Keep NLTK data inside the app tree so repeated runs reuse the download.
nltk.download('stopwords', download_dir='/workspace/app/data/nltk_data')
nltk.data.path.append("/workspace/app/data/nltk_data")
from nltk.corpus import stopwords

# The transformer pipeline tags parts of speech more accurately than
# en_core_web_sm, at the cost of a slower load; install it first with
# `python -m spacy download en_core_web_trf`.
nlp = spacy.load("en_core_web_trf")

JD_FILE = "/workspace/app/code/jd.txt"
RESUME_FILE = "/workspace/app/code/resume.txt"
unwanted_chars = ['\n', '\n\n', '\n\n\n', '\t', '\t\t', '\t\t\t']


def load_file(name=JD_FILE):
    """Return the file's contents as a list of lines."""
    with io.open(name, 'rt') as f:
        return f.readlines()


def extract_keywords(input_list):
    """Drop stopwords and whitespace tokens, then return unigram candidates."""
    # Build the stopword set once; calling stopwords.words() per token is very
    # slow (and with no language argument it scans every language NLTK ships).
    nltk_stopwords = set(stopwords.words('english'))
    text_all = []
    for line in input_list:
        # split() rather than split(' ') so newlines/tabs are not left glued to words
        tokens_without_sw = [word for word in line.split()
                             if word not in nltk_stopwords and word not in unwanted_chars]
        text_all.append(' '.join(tokens_without_sw))
    full_sentence = ' '.join(text_all)
    # Extract candidate unigrams
    count = CountVectorizer(ngram_range=(1, 1), stop_words="english").fit([full_sentence])
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
    return list(count.get_feature_names_out())


def missing_keywords(key_jd, key_resume):
    """Return the percentage and list of JD keywords absent from the resume."""
    missing_in_resume = [word for word in key_jd if word not in key_resume]
    # Guard against an empty keyword set to avoid ZeroDivisionError.
    missing_percent = 100 * len(missing_in_resume) / len(key_jd) if key_jd else 0.0
    return missing_percent, missing_in_resume
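
# For example, with hypothetical inputs:
#   missing_keywords({'python', 'sql'}, {'python'}) -> (50.0, ['sql'])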


def compare_documents():
    """Extract noun sets from both documents and report JD nouns missing from the resume."""
    input_jd = ' '.join(load_file(JD_FILE))
    doc_jd = nlp(input_jd)
    # Noun chunks are collected as well, though only single-token nouns are compared below.
    noun_phrases_jd = set(chunk.text.strip().lower() for chunk in doc_jd.noun_chunks)
    # Lowercase tokens so the JD/resume comparison is case-insensitive,
    # matching the noun-phrase normalisation above.
    nouns_jd = set(token.text.lower() for token in doc_jd if token.pos_ == "NOUN")
    print("nouns_jd:", nouns_jd, '\n', "#" * 100)

    input_resume = ' '.join(load_file(RESUME_FILE))
    doc_resume = nlp(input_resume)
    noun_phrases_resume = set(chunk.text.strip().lower()
                              for chunk in doc_resume.noun_chunks)
    nouns_resume = set(token.text.lower() for token in doc_resume if token.pos_ == "NOUN")
    print("nouns_resume:", nouns_resume, '\n', "#" * 100)

    print(missing_keywords(nouns_jd, nouns_resume))


if __name__ == "__main__":
    compare_documents()
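
# Usage sketch (assumes jd.txt and resume.txt exist at the paths above):
#   $ python nounmatching.py
# Prints both noun sets, then (percent of JD nouns missing, list of missing nouns).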