Skip to content


Merge pull request #11 from patvdleer/master
Browse files Browse the repository at this point in the history
PY3 fixes
  • Loading branch information
hopple authored Sep 20, 2017
2 parents a0f7119 + 79288c9 commit 904b905
Show file tree
Hide file tree
Showing 176 changed files with 120,339 additions and 11 deletions.
2 changes: 2 additions & 0 deletions build/lib.linux-x86_64-2.7/sematch/
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__author__ = 'Ganggao Zhu'

349 changes: 349 additions & 0 deletions build/lib.linux-x86_64-2.7/sematch/
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Ganggao Zhu- Grupo de Sistemas Inteligentes
# gzhu[at]
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sematch.semantic.sparql import NameSPARQL, QueryGraph
from sematch.semantic.similarity import YagoTypeSimilarity
from sematch.utility import memoized
from sematch.nlp import word_tokenize, word_process, Extraction

import numpy as np
import itertools
from collections import Counter

class Matcher:
    """This class is used for concept based entity match in DBpedia.

    Words are mapped to synsets through ``YagoTypeSimilarity``, the synsets
    are mapped to YAGO / DBpedia concept URIs, and those URIs are sent to a
    ``QueryGraph`` to retrieve matching entities.
    """

    # Number of concept URIs sent per query. The value 5 is hard-coded in the
    # original loops; presumably it keeps each query small -- TODO confirm.
    _BATCH_SIZE = 5

    def __init__(self, result_limit=5000, expansion=True, show_query=False):
        """
        :param result_limit: value handed to ``QueryGraph``
        :param expansion: when true, synsets are widened with
            ``synset_expand`` before being linked
        :param show_query: flag forwarded to every ``QueryGraph`` query call
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    @classmethod
    def _batches(cls, items):
        """Yield consecutive slices of *items* of at most ``_BATCH_SIZE``."""
        for start in range(0, len(items), cls._BATCH_SIZE):
            yield items[start:start + cls._BATCH_SIZE]

    @staticmethod
    def _unique_by_uri(results):
        """Drop results whose ``'uri'`` was already seen, keeping the first."""
        seen = {}
        ordered = []
        for res in results:
            uri = res['uri']
            if uri not in seen:
                seen[uri] = res
                ordered.append(res)
        return ordered

    def type_links(self, word, lang='eng'):
        """Link a word to concept URIs.

        :param word: word to look up
        :param lang: language code passed to ``multilingual2synset``
        :return: list of dicts with keys ``'name'``, ``'gloss'``, ``'lemma'``
            and ``'lod'`` (the non-empty list of YAGO / DBpedia links);
            synsets without any link are omitted.
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            expanded = itertools.chain.from_iterable(
                self._yago.synset_expand(s) for s in synsets)
            synsets = list(set(expanded))
        links = []
        for s in synsets:
            yago_link = self._yago.synset2yago(s)
            dbpedia_link = self._yago.synset2dbpedia(s)
            concept_link = [uri for uri in (yago_link, dbpedia_link) if uri]
            if not concept_link:
                continue
            links.append({
                # NOTE(review): the right-hand side of this assignment was
                # truncated in the file; ``s.name()`` assumes NLTK-style
                # synset objects -- confirm.
                'name': s.name(),
                'gloss': s._definition,
                'lemma': ' '.join(s._lemma_names),
                'lod': concept_link,
            })
        return links

    def query_process(self, query):
        """Process query into concept (common noun) and entity (proper noun).

        Link them to Knowledge Graph uri links respectively.

        :param query: short text query
        :return: tuple of concepts and entities in uris.
        """
        entities = self._extracter.extract_chunks_sent(query)
        # Words that belong to an entity mention must not be treated as
        # concepts as well.
        entity_filter = set(itertools.chain.from_iterable(
            e.lower().split() for e in entities))
        concepts = [c for c in set(self._extracter.extract_nouns(query))
                    if c not in entity_filter]
        concept_uris = set()
        for c in concepts:
            for link in self.type_links(c):
                concept_uris.update(link['lod'])
        entity_uris = set(itertools.chain.from_iterable(
            map(self._linker.name2entities, entities)))
        return list(concept_uris), list(entity_uris)

    def match_concepts(self, concepts, lang='en'):
        """Query entities for a list of concept URIs.

        :param concepts: list of concept URIs, queried in small batches
        :param lang: language code forwarded to ``type_query``
        :return: list of result dicts, unique by their ``'uri'`` value
        """
        results = []
        for batch in self._batches(concepts):
            results.extend(
                self._query_graph.type_query(batch, lang, self._show_query))
        return self._unique_by_uri(results)

    def match_type(self, query, lang='eng'):
        """Match entities whose type corresponds to the words of *query*.

        :param query: whitespace separated words
        :param lang: one of ``'eng'``, ``'spa'``, ``'cmn'``
        :return: list of result dicts as produced by ``match_concepts``
        :raises KeyError: if *lang* is not one of the supported codes
        """
        lang_map = {'eng': 'en', 'spa': 'es', 'cmn': 'zh'}
        result_lang = lang_map[lang]
        concept_uris = []
        for w in query.split():
            for link in self.type_links(w, lang):
                concept_uris.extend(link['lod'])
        concept_uris = list(set(concept_uris))
        return self.match_concepts(concept_uris, result_lang)

    def match_entity_type(self, query):
        """Match entities by both the concepts and the entities in *query*.

        :param query: short text query, split by ``query_process``
        :return: list of result dicts, unique by their ``'uri'`` value
        """
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            for batch in self._batches(concepts):
                results.extend(self._query_graph.type_entity_query(
                    batch, e, self._show_query))
        return self._unique_by_uri(results)

class SimClassifier:
    """This class implements similarity based category classifiers."""

    def __init__(self, labels, cat_features, feature_weights, sim_metric, sim_model='weighted'):
        """Class initialization.

        :param labels: predefined categories
        :param cat_features: features to represent each category
            (mapping category -> sequence of feature words)
        :param feature_weights: mapping category -> list of
            ``(feature word, weight)`` pairs
        :param sim_metric: word similarity function taking two words
        :param sim_model: ``'weighted'``, ``'max'`` or ``'average'``
        :raises KeyError: if *sim_model* is not one of the names above
        """
        self._categories = labels
        self._cat_features = cat_features
        self._feature_weights = feature_weights
        self._sim_metric = sim_metric
        self._sim_model = self.pick_sim_model(sim_model)

    def pick_sim_model(self, sim_model):
        """Return the word/category similarity function named *sim_model*.

        :raises KeyError: for an unknown model name
        """
        model_dic = {
            'weighted': self.weighted_similarity,
            'max': self.max_similarity,
            'average': self.average_similarity,
        }
        return model_dic[sim_model]

    @classmethod
    def train(cls, corpus, sim_metric, feature_num=5, sim_model='weighted'):
        """Extract categories, features, feature weights, from corpus.

        Compute the weight for each feature token in each category.
        The weight is computed as token_count / total_feature_count, where
        the total is taken over the *feature_num* most common tokens.

        :param corpus: iterable of ``(sentence, category)`` pairs
        :param sim_metric: word similarity function
        :param feature_num: number of most common tokens kept per category
        :param sim_model: similarity model name, see ``pick_sim_model``
        :return: a new classifier instance
        """
        print("Training...")
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(word_process(word_tokenize(sent)))
        labels = list(cat_word)
        cat_features = {}
        feature_weights = {}
        for cat, tokens in cat_word.items():
            w_c_pairs = Counter(tokens).most_common(feature_num)
            # Built without zip(*...) so a category whose sentences produced
            # no tokens yields empty features instead of an unpacking error.
            cat_features[cat] = tuple(w for w, _ in w_c_pairs)
            total_count = float(sum(count for _, count in w_c_pairs))
            feature_weights[cat] = [(w, count / total_count) for w, count in w_c_pairs]
        return cls(labels, cat_features, feature_weights, sim_metric, sim_model)

    def weighted_similarity(self, word, category):
        """Input word is compared to each feature word using semantic similarity.

        The whole similarity score is computed as weighted sum.

        :param word: feature word
        :param category: a predefined category
        :return: weighted word similarity score between word and category
            (0.0 when the category has no weighted features)
        """
        pairs = self._feature_weights[category]
        if not pairs:
            return 0.0
        features, weights = zip(*pairs)
        scores = [self._sim_metric(word, f) for f in features]
        # NOTE(review): the start of this return statement was truncated in
        # the file; a dot product of scores and weights matches the
        # documented "weighted sum" and the surviving fragment -- confirm.
        return np.dot(np.array(scores), np.array(weights).transpose())

    def max_similarity(self, word, category):
        """Compute similarity between word and category, where
        category is represented by several feature words.

        :param word: feature word
        :param category: a predefined category
        :return: max word similarity score between word and category,
            never below 0.0
        """
        scores = [self._sim_metric(word, f) for f in self._cat_features[category]]
        return max(scores + [0.0])

    def average_similarity(self, word, category):
        """Compute similarity between word and category, where
        category is represented by several feature words.

        :param word: feature word
        :param category: a predefined category
        :return: average word similarity score between word and category
            (0.0 when the category has no features)
        """
        features = self._cat_features[category]
        if not features:
            # Avoid dividing by zero for a category without feature words.
            return 0.0
        scores = [self._sim_metric(word, f) for f in features]
        return sum(scores + [0.0]) / len(features)

    def category_similarity(self, word, category):
        """Compute the semantic similarity between a word and a category.

        :param word: a feature word
        :param category: predefined category
        :return: similarity score between word and category, computed with
            the model selected at construction time
        """
        return self._sim_model(word, category)

    def classify_single(self, sent, feature_model='max'):
        """Classify one sentence.

        The input feature words are compared to each category based on
        category similarity. The per-word scores are combined with ``max``
        when *feature_model* is ``'max'`` and summed otherwise.
        The category having highest similarity score is returned.

        :param sent: sentence to classify
        :param feature_model: similarity combination model 'max', 'sum'.
            Default is 'max'
        :return: the predicted category label.
        """
        feature_words = list(set(word_process(word_tokenize(sent))))
        combine = max if feature_model == 'max' else sum
        score = {}
        for c in self._categories:
            score[c] = combine(
                [self.category_similarity(w, c) for w in feature_words] + [0.0])
        return Counter(score).most_common(1)[0][0]

    def classify(self, X, feature_model='max'):
        """Classify every sentence in *X*; see ``classify_single``.

        :return: list of predicted category labels, one per input
        """
        return [self.classify_single(x, feature_model) for x in X]

from sklearn.svm import LinearSVC
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import DictVectorizer

import time

def timeit(func):
    """Decorate *func* so that every call also reports its duration.

    :param func: callable to wrap
    :return: wrapper that calls *func* with the given arguments and returns
        ``(result, delta)``, where ``delta`` is the elapsed time in seconds
        measured with ``time.time()``
    """
    from functools import wraps

    # wraps() keeps the wrapped function's __name__ and __doc__.
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        delta = time.time() - start
        return result, delta
    return wrapper

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Transform input text into feature representation."""

    def __init__(self, corpus, word_sim_metric, feature_num=10, model='sim'):
        """
        :param corpus: use a corpus to train a vector representation;
            iterable of ``(sentence, category)`` pairs
        :param word_sim_metric: word similarity function taking two words
        :param feature_num: number of most common tokens kept per category
        :param model: onehot or sim
        """
        self._model = model
        self._word_sim = word_sim_metric
        self._features = self.extract_features(corpus, feature_num)

    def fit(self, X, y=None):
        """No-op: the feature words are already extracted in ``__init__``."""
        return self

    def inverse_transform(self, X):
        """Return *X* unchanged."""
        return X

    def extract_features(self, corpus, feature_num=10):
        """Collect the most common tokens of every category.

        :param corpus: iterable of ``(sentence, category)`` pairs
        :param feature_num: number of most common tokens kept per category
        :return: set of feature words over all categories
        """
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(word_process(word_tokenize(sent)))
        feature_words = set()
        for tokens in cat_word.values():
            # NOTE(review): the accumulation statement was lost in the file;
            # adding each category's top words is the evident intent --
            # confirm.
            feature_words.update(
                word for word, _ in Counter(tokens).most_common(feature_num))
        return feature_words

    def similarity(self, tokens, feature):
        """Return the best similarity between *feature* and any token.

        :return: maximum of ``word_sim_metric(feature, token)`` over
            *tokens*, never below 0.0
        """
        return max([self._word_sim(feature, t) for t in tokens] + [0.0])

    def unigram_features(self, tokens):
        """Build boolean ``contains(<feature>)`` entries for *tokens*."""
        words = set(tokens)
        return {'contains({})'.format(f): (f in words) for f in self._features}

    def sim_features(self, tokens):
        """Build ``sim(<feature>)`` similarity entries for *tokens*."""
        words = set(tokens)
        return {'sim({})'.format(f): self.similarity(words, f)
                for f in self._features}

    def transform(self, X):
        """Turn each text in *X* into a feature dict.

        :param X: iterable of texts
        :return: list of dicts; ``contains(...)`` entries when the model is
            ``'onehot'``, ``sim(...)`` entries otherwise
        """
        X_tokens = [word_process(word_tokenize(x)) for x in X]
        featurize = (self.unigram_features if self._model == 'onehot'
                     else self.sim_features)
        # A list (not a lazy map object) so the result can be iterated more
        # than once.
        return [featurize(tokens) for tokens in X_tokens]

class SimSVMClassifier:
    """Text classifier combining ``TextPreprocessor`` features with an
    estimator inside a ``Pipeline``."""

    def __init__(self, labels, model):
        """
        :param labels: label encoder providing ``inverse_transform``
        :param model: fitted model providing ``predict``
        """
        self._labels = labels
        self._model = model

    @classmethod
    def train(cls, X, y, word_sim_metric, classifier=LinearSVC,
              feature_num=10, feature_type='sim', verbose=True):
        """Build and fit the classification pipeline.

        :param X: training texts
        :param y: training labels, one per text
        :param word_sim_metric: word similarity function handed to
            ``TextPreprocessor``
        :param classifier: estimator class or instance; a class is
            instantiated with no arguments
        :param feature_num: forwarded to ``TextPreprocessor``
        :param feature_type: ``TextPreprocessor`` model, 'sim' or 'onehot'
        :param verbose: print progress messages when true
        :return: a new classifier wrapping the fitted pipeline
        """
        if isinstance(classifier, type):
            classifier = classifier()

        labels = LabelEncoder()
        y_train = labels.fit_transform(y)

        # NOTE(review): the decorator line is missing from the file; the
        # ``model, secs = build()`` unpacking below requires build to be
        # wrapped by ``timeit`` -- confirm.
        @timeit
        def build():
            # Materialized so the pairs survive more than one pass.
            corpus = list(zip(X, y))
            model = Pipeline([
                ('preprocessor', TextPreprocessor(corpus, word_sim_metric, feature_num, feature_type)),
                ('vectorizer', DictVectorizer()),
                ('classifier', classifier),
            ])
            # NOTE(review): this call was truncated to "), y_train)" in the
            # file; fitting on X with the encoded labels is the evident
            # intent -- confirm.
            model.fit(X, y_train)
            return model

        if verbose:
            print("Building the model")
        model, secs = build()
        if verbose:
            print("Complete model building in {:0.3f} seconds".format(secs))

        return cls(labels, model)

    def classify(self, X):
        """Predict labels for the texts in *X*.

        :return: list of labels decoded with the stored label encoder
        """
        predicted = self._model.predict(X)
        return list(self._labels.inverse_transform(predicted))

0 comments on commit 904b905

Please sign in to comment.