-
Notifications
You must be signed in to change notification settings - Fork 122
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This commit adds support for the Gitter backend.
Raw and Enriched indexes have been added along with their tests and schemas. Signed-off-by: Nitish Gupta <[email protected]>
- Loading branch information
1 parent
c6a65f3
commit fa7ce42
Showing
9 changed files
with
829 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2015-2020 Bitergia | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
# Authors: | ||
# Nitish Gupta <[email protected]> | ||
# | ||
|
||
import logging | ||
import re | ||
from urllib.parse import urlparse | ||
|
||
from grimoirelab_toolkit.datetime import str_to_datetime | ||
|
||
from .enrich import Enrich, metadata | ||
from ..elastic_mapping import Mapping as BaseMapping | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Mapping(BaseMapping):
    """Elasticsearch mapping used for the enriched Gitter index."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Return the Elasticsearch mapping for enriched items.

        The same mapping is returned whatever the value of `es_major`.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """
        # `text_analyzed` is declared as an indexed text field with
        # fielddata enabled.
        items_mapping = """
        {
            "properties": {
                "text_analyzed": {
                    "type": "text",
                    "fielddata": true,
                    "index": true
                }
            }
        } """

        return {"items": items_mapping}
|
||
|
||
class GitterEnrich(Enrich):
    """Enricher for Gitter messages.

    Builds an enriched item from a raw Gitter item: copies selected raw
    and message fields, extracts mentioned users, issue/pull request links
    and URL hostnames, and adds identity, project and metadata fields
    through helpers that are not defined in this class (presumably
    inherited from `Enrich`).
    """

    mapping = Mapping

    # REGEX to extract links from HTML text
    HTML_LINK_REGEX = re.compile("href=[\"\'](.*?)[\"\']")

    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
                 db_user='', db_password='', db_host=''):
        """Forward all the arguments, unchanged, to the parent initializer."""
        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                         db_user, db_password, db_host)

    def get_field_author(self):
        """Return the key of the message data that holds the author."""
        return "fromUser"

    def get_sh_identity(self, item, identity_field=None):
        """Build an identity dictionary from the author of a raw item.

        :param item: raw item; the author is read from
            item['data'][<author field>]
        :param identity_field: not used by this implementation
        :returns: dictionary with the keys 'username', 'name' and 'email';
            all of them are None when the item has no author field
        """
        # email not available for gitter
        identity = {
            'username': None,
            'name': None,
            'email': None
        }

        author_field = self.get_field_author()
        if author_field not in item['data']:
            return identity

        from_ = item['data'][author_field]
        identity['username'] = from_.get('username', None)
        identity['name'] = from_.get('displayName', None)

        return identity

    def get_identities(self, item):
        """ Return the identities from an item """

        yield self.get_sh_identity(item)

    def get_project_repository(self, eitem):
        """Return the origin with its last '/' replaced by a space.

        :param eitem: enriched item with an 'origin' key
        :returns: string '<origin before the last slash> <last segment>'
        :raises IndexError: when the origin contains no '/'
        """
        tokens = eitem['origin'].rsplit("/", 1)
        return tokens[0] + " " + tokens[1]

    @metadata
    def get_rich_item(self, item):
        """Create the enriched item for a raw Gitter message.

        :param item: raw item; the message is read from item['data']
        :returns: dictionary with the enriched fields
        """
        # Raw fields missing from the item are stored as None
        eitem = {f: item.get(f) for f in self.RAW_FIELDS_COPY}

        message = item['data']

        eitem['unread'] = 1 if message['unread'] else 0
        eitem['text_analyzed'] = message['text']

        # Message fields copied verbatim; absent ones are stored as None
        for f in ("readBy", "issues", "id"):
            eitem[f] = message.get(f)

        # NOTE: get_rich_links annotates the issue entities in place, so
        # the list stored above under 'issues' also gets the extra keys.
        eitem.update(self.get_rich_links(message))

        # Hour of the day taken from the update date of the item
        message_timestamp = str_to_datetime(eitem['metadata__updated_on'])
        eitem['tz'] = int(message_timestamp.strftime("%H"))

        if self.sortinghat:
            eitem.update(self.get_item_sh(item))

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        eitem.update(self.get_grimoire_fields(item["metadata__updated_on"], "message"))

        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)
        return eitem

    def get_rich_links(self, item):
        """Extract mentions, issue links and URL hostnames from a message.

        Missing or empty 'issues', 'mentions', 'urls' and 'html' keys are
        tolerated, in line with how `get_rich_item` treats 'issues' as an
        optional field.

        :param item: message data (the 'data' part of a raw item)
        :returns: dictionary with 'url_hostname' (always present, possibly
            empty) and 'mentioned' (only when the message has mentions)
        """
        rich_item = {}

        issues = item.get('issues')
        if issues:
            # Entities are updated in place; nothing is added to rich_item
            self.extract_issues(issues, item.get('html') or '')

        mentions = item.get('mentions')
        if mentions:
            rich_item['mentioned'] = self.extract_mentions(mentions)

        rich_item['url_hostname'] = []
        for url in item.get('urls') or []:
            url_parsed = urlparse(url['url'])
            rich_item['url_hostname'].append('{uri.scheme}://{uri.netloc}/'.format(uri=url_parsed))

        return rich_item

    def extract_issues(self, issue_pr, html_text):
        """Enrich issues or PRs mentioned in the message

        Each entity is paired, by position, with the links found in the
        HTML text. When the second to last path segment of the link is
        'issues' or 'pull', the entity gets an 'is_issue' or 'is_pull' key
        ('<repo> #<number>') and a 'url' key. Entities are modified in
        place. Entities without 'repo', without a link at their position,
        or whose link has no '/' are left untouched instead of raising
        IndexError.

        :param issue_pr: list of issue/PR entities of the message
        :param html_text: HTML version of the message
        """
        # NOTE(review): every href in the HTML is collected, not only the
        # issue/PR ones, so positional pairing presumably assumes that the
        # message contains no other links before them - confirm.
        links_found = self.HTML_LINK_REGEX.findall(html_text)
        flag_by_segment = {'issues': 'is_issue', 'pull': 'is_pull'}

        for position, entity in enumerate(issue_pr):
            if 'repo' not in entity or position >= len(links_found):
                continue

            link = links_found[position]
            segments = link.split('/')
            if len(segments) < 2:
                continue

            flag = flag_by_segment.get(segments[-2])
            if flag is None:
                continue

            entity[flag] = entity['repo'] + ' #' + entity['number']
            entity['url'] = link

    def extract_mentions(self, mentioned):
        """Enrich users mentioned in the message

        :param mentioned: list of mention entities of the message
        :returns: list of dictionaries with 'mentioned_username' and
            'mentioned_userId'; entities without 'userId' are skipped
        """
        return [
            {'mentioned_username': usr['screenName'], 'mentioned_userId': usr['userId']}
            for usr in mentioned
            if 'userId' in usr
        ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# Copyright (C) 2015-2020 Bitergia | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation; either version 3 of the License, or | ||
# (at your option) any later version. | ||
# | ||
# This program is distributed in the hope that it will be useful, | ||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
# GNU General Public License for more details. | ||
# | ||
# You should have received a copy of the GNU General Public License | ||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
# | ||
# Authors: | ||
# Nitish Gupta <[email protected]> | ||
# | ||
|
||
from .elastic import ElasticOcean | ||
from ..elastic_mapping import Mapping as BaseMapping | ||
|
||
|
||
class Mapping(BaseMapping):
    """Elasticsearch mapping used for the raw Gitter index."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Return the Elasticsearch mapping for raw items.

        The same mapping is returned whatever the value of `es_major`.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """
        # Dynamic mapping is enabled at the top level and disabled for
        # everything under 'data'.
        raw_mapping = '''
        {
            "dynamic":true,
            "properties": {
                "data": {
                    "dynamic":false,
                    "properties": {}
                }
            }
        }
        '''

        return {"items": raw_mapping}
|
||
|
||
class GitterOcean(ElasticOcean):
    """Gitter Ocean feeder"""

    mapping = Mapping

    @classmethod
    def get_perceval_params_from_url(cls, url):
        """ Get the perceval params given a URL for the data source

        The last two path segments of the URL are returned. Trailing
        slashes are ignored, so '.../org/room/' yields the same result as
        '.../org/room' instead of an empty room name.

        :param url: URL of the Gitter room
        :returns: list with two items, the organization and the room
        :raises IndexError: when the URL has fewer than two segments
        """
        tokens = url.rstrip('/').split('/')
        return [tokens[-2], tokens[-1]]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
--- | ||
title: Add support for Gitter | ||
category: added | ||
author: Nitish Gupta <[email protected]> | ||
issue: 820 | ||
notes: Added support for creating raw and enriched | ||
indexes of messages from Gitter. The visualizations | ||
and tests for extracted data have also been added. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
name,type,aggregatable,description | ||
author_bot,boolean,true,"True if the given author is identified as a bot." | ||
author_domain,keyword,true,"Author domain from Email. " | ||
author_gender,keyword,true,"Author gender. " | ||
author_id,keyword,true,"Author Id from SortingHat." | ||
author_multi_org_names,keyword,true,"List of the author organizations from SortingHat profile." | ||
author_name,keyword,true,"Author name." | ||
author_org_name,keyword,true,"Author organization name from SortingHat profile." | ||
author_user_name,keyword,true,"Author username for the platform" | ||
author_uuid,keyword,true,"Author UUID from SortingHat." | ||
fromUser_bot,boolean,true,"True if the given sender is identified as a bot." | ||
fromUser_domain,keyword,true,"Sender domain from Email. " | ||
fromUser_gender,keyword,true,"Sender gender. " | ||
fromUser_id,keyword,true,"Sender Id from SortingHat." | ||
fromUser_multi_org_names,keyword,true,"List of the sender organizations from SortingHat profile." | ||
fromUser_name,keyword,true,"Sender name." | ||
fromUser_org_name,keyword,true,"Sender organization name from SortingHat profile." | ||
fromUser_user_name,keyword,true,"Sender username for the platform" | ||
fromUser_uuid,keyword,true,"Sender UUID from SortingHat." | ||
grimoire_creation_date,date,true,"Message date (when the original author sent the message)." | ||
id,keyword,true,"Message ID in Gitter. " | ||
is_gitter_message,boolean,true,"True if the item is a Gitter message. " | ||
issues,list,true,"List of issues mentioned in the message, Empty if no issues were mentioned. " | ||
mentioned,list,true,"List of users mentioned in the message, Empty if no user is mentioned. " | ||
metadata__enriched_on,date,true,"Date when the item was enriched." | ||
metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information." | ||
metadata__gelk_version,keyword,true,"Version of the backend used to enrich information." | ||
metadata__timestamp,date,true,"Date when the item was stored in RAW index." | ||
metadata__updated_on,date,true,"Date when the item was updated in its original data source." | ||
origin,keyword,true,"Original URL where the room was retrieved from." | ||
project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)" | ||
project,keyword,true,"Project." | ||
readBy,int,true,"Count of users who have read the message" | ||
repository_labels,keyword,true,"Custom repository labels defined by the user." | ||
tag,keyword,true,"Perceval tag." | ||
text_analyzed,keyword,true,"Message body in plain text. " | ||
tz,int,true,"Time of the day the message was sent. " | ||
unread,boolean,true,"True if the message is unread by user, False if read. " | ||
url_hostname,list,true,"List of URL hostnames of the URL(s) mentioned in the message." | ||
uuid,keyword,true,"Perceval UUID." |
Oops, something went wrong.