Skip to content

Commit

Permalink
This commit adds support for the Gitter backend.
Browse files Browse the repository at this point in the history
Raw and Enriched indexes have been added along with their tests and schemas.

Signed-off-by: Nitish Gupta <[email protected]>
  • Loading branch information
imnitishng authored and valeriocos committed Apr 16, 2020
1 parent c6a65f3 commit fa7ce42
Show file tree
Hide file tree
Showing 9 changed files with 829 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ Each enriched index includes one or more types of documents, which are summarize
- **GitHub repo statistics**: each document includes repo statistics (e.g., forks, watchers).
- **GitLab issues**: each document corresponds to an issue.
- **GitLab merge requests**: each document corresponds to a merge request.
- **Gitter**: each document corresponds to a message.
- **Googlehits**: each document contains hits information derived from Google.
- **Groupsio**: each document corresponds to a message.
- **Hyperkitty**: each document corresponds to a message.
Expand Down
184 changes: 184 additions & 0 deletions grimoire_elk/enriched/gitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Nitish Gupta <[email protected]>
#

import logging
import re
from urllib.parse import urlparse

from grimoirelab_toolkit.datetime import str_to_datetime

from .enrich import Enrich, metadata
from ..elastic_mapping import Mapping as BaseMapping


logger = logging.getLogger(__name__)


class Mapping(BaseMapping):
    """Elasticsearch mapping for the enriched Gitter index."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Build the Elasticsearch mapping for enriched Gitter items.

        The mapping declares `text_analyzed` as an indexed text field
        with fielddata enabled. `es_major` is accepted for interface
        compatibility but is not used by this implementation.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """

        items_mapping = """
{
"properties": {
"text_analyzed": {
"type": "text",
"fielddata": true,
"index": true
}
}
} """

        return dict(items=items_mapping)


class GitterEnrich(Enrich):
    """Enricher that turns raw Gitter messages into enriched items."""

    mapping = Mapping

    # REGEX to extract links from HTML text
    HTML_LINK_REGEX = re.compile("href=[\"\'](.*?)[\"\']")

    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
                 db_user='', db_password='', db_host=''):
        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
                         db_user, db_password, db_host)

    def get_field_author(self):
        """Return the name of the raw message field holding the author."""
        return "fromUser"

    def get_sh_identity(self, item, identity_field=None):
        """Build the identity dict (username, name, email) for an item.

        :param item: raw item; the author is read from
            ``item['data'][<author field>]``
        :param identity_field: accepted for interface compatibility;
            not used by this implementation
        :returns: dict with 'username', 'name' and 'email' keys; all
            values are None when the author field is missing
        """
        # email not available for gitter
        identity = {
            'username': None,
            'name': None,
            'email': None
        }

        author_field = self.get_field_author()
        if author_field not in item['data']:
            return identity

        from_ = item['data'][author_field]
        identity['username'] = from_.get('username', None)
        identity['name'] = from_.get('displayName', None)

        return identity

    def get_identities(self, item):
        """ Return the identities from an item """

        yield self.get_sh_identity(item)

    def get_project_repository(self, eitem):
        """Return the origin with its last '/' replaced by a space.

        :param eitem: enriched item with an 'origin' key
        :returns: '<origin up to last slash> <last segment>'
        """
        tokens = eitem['origin'].rsplit("/", 1)
        return tokens[0] + " " + tokens[1]

    @metadata
    def get_rich_item(self, item):
        """Create the enriched item for a raw Gitter message.

        :param item: raw item; the message is read from ``item['data']``
        :returns: dict with the enriched fields
        """
        eitem = {}

        # Missing raw fields are stored as None
        for f in self.RAW_FIELDS_COPY:
            eitem[f] = item.get(f)

        message = item['data']

        eitem['unread'] = 1 if message['unread'] else 0
        eitem['text_analyzed'] = message['text']

        copy_fields = ["readBy", "issues", "id"]

        for f in copy_fields:
            eitem[f] = message.get(f)

        # NOTE: eitem['issues'] is the same list object as
        # message['issues'], so the in-place annotations made by
        # get_rich_links()/extract_issues() show up in the enriched item.
        eitem.update(self.get_rich_links(message))

        # 'tz' stores the hour of the day of the update timestamp
        message_timestamp = str_to_datetime(eitem['metadata__updated_on'])
        eitem['tz'] = message_timestamp.hour

        if self.sortinghat:
            eitem.update(self.get_item_sh(item))

        if self.prjs_map:
            eitem.update(self.get_item_project(eitem))

        eitem.update(self.get_grimoire_fields(item["metadata__updated_on"], "message"))

        self.add_repository_labels(eitem)
        self.add_metadata_filter_raw(eitem)
        return eitem

    def get_rich_links(self, item):
        """Extract mentions and URL hostnames; annotate issues in place.

        Keys that are missing from the message ('issues', 'mentions',
        'urls', 'html') are treated the same as empty values instead of
        raising KeyError.

        :param item: message dict (the 'data' part of a raw item)
        :returns: dict with 'url_hostname' (always present, possibly
            empty) and 'mentioned' (only when there are mentions)
        """
        rich_item = {}

        issues = item.get('issues')
        if issues:
            # The issue dicts are updated in place; nothing is returned
            self.extract_issues(issues, item.get('html') or '')

        mentions = item.get('mentions')
        if mentions:
            rich_item['mentioned'] = self.extract_mentions(mentions)

        rich_item['url_hostname'] = [
            '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(url['url']))
            for url in item.get('urls') or []
        ]

        return rich_item

    def extract_issues(self, issue_pr, html_text):
        """Enrich issues or PRs mentioned in the message

        Each entry of `issue_pr` that has a 'repo' key is updated in
        place with 'is_issue' or 'is_pull' plus 'url', depending on the
        link found at the same position in `html_text`.

        NOTE(review): links are paired with issues by position, which
        assumes the HTML lists the issue links in the same order as
        `issue_pr` and with no other links before them - confirm
        against real Gitter payloads.

        :param issue_pr: list of issue/PR dicts from the message
        :param html_text: HTML version of the message
        """
        links_found = self.HTML_LINK_REGEX.findall(html_text)

        # zip() stops at the shorter sequence, so a message with fewer
        # links than issues no longer raises IndexError
        for entity, link in zip(issue_pr, links_found):
            if 'repo' not in entity:
                continue

            segments = link.split('/')
            # Second-to-last path segment tells issues and pulls apart
            link_type = segments[-2] if len(segments) > 1 else None

            if link_type == 'issues':
                entity['is_issue'] = '{} #{}'.format(entity['repo'], entity['number'])
            elif link_type == 'pull':
                entity['is_pull'] = '{} #{}'.format(entity['repo'], entity['number'])
            else:
                continue

            entity['url'] = link

    def extract_mentions(self, mentioned):
        """Enrich users mentioned in the message

        Mentions without a 'userId' key are skipped.

        :param mentioned: list of mention dicts from the message
        :returns: list of dicts with 'mentioned_username' and
            'mentioned_userId'
        """
        return [
            {'mentioned_username': usr['screenName'], 'mentioned_userId': usr['userId']}
            for usr in mentioned
            if 'userId' in usr
        ]
66 changes: 66 additions & 0 deletions grimoire_elk/raw/gitter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015-2020 Bitergia
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# Nitish Gupta <[email protected]>
#

from .elastic import ElasticOcean
from ..elastic_mapping import Mapping as BaseMapping


class Mapping(BaseMapping):
    """Elasticsearch mapping for the raw Gitter index."""

    @staticmethod
    def get_elastic_mappings(es_major):
        """Build the Elasticsearch mapping for raw Gitter items.

        The mapping is dynamic at the top level and disables dynamic
        mapping under `data`. `es_major` is accepted for interface
        compatibility but is not used by this implementation.

        :param es_major: major version of Elasticsearch, as string
        :returns: dictionary with a key, 'items', with the mapping
        """

        items_mapping = '''
{
"dynamic":true,
"properties": {
"data": {
"dynamic":false,
"properties": {}
}
}
}
'''

        return dict(items=items_mapping)


class GitterOcean(ElasticOcean):
    """Gitter Ocean feeder"""

    mapping = Mapping

    @classmethod
    def get_perceval_params_from_url(cls, url):
        """ Get the perceval params given a URL for the data source

        The last two '/'-separated segments of the URL are returned as
        the organization and the room. Trailing slashes are ignored, so
        '.../org/room/' gives the same result as '.../org/room'.

        :param url: URL of the Gitter room
        :returns: list with the organization and the room, in that order
        :raises IndexError: if the URL has fewer than two segments
        """
        # Strip trailing slashes first; otherwise the room would be ''
        # and the organization slot would hold the room name
        tokens = url.rstrip('/').split('/')
        return [tokens[-2], tokens[-1]]
4 changes: 4 additions & 0 deletions grimoire_elk/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from perceval.backends.core.git import Git, GitCommand
from perceval.backends.core.github import GitHub, GitHubCommand
from perceval.backends.core.gitlab import GitLab, GitLabCommand
from perceval.backends.core.gitter import Gitter, GitterCommand
from perceval.backends.core.googlehits import GoogleHits, GoogleHitsCommand
from perceval.backends.core.groupsio import Groupsio, GroupsioCommand
from perceval.backends.core.hyperkitty import HyperKitty, HyperKittyCommand
Expand Down Expand Up @@ -91,6 +92,7 @@
from .enriched.github import GitHubEnrich
from .enriched.github2 import GitHubEnrich2
from .enriched.gitlab import GitLabEnrich
from .enriched.gitter import GitterEnrich
from .enriched.google_hits import GoogleHitsEnrich
from .enriched.groupsio import GroupsioEnrich
from .enriched.hyperkitty import HyperKittyEnrich
Expand Down Expand Up @@ -130,6 +132,7 @@
from .raw.git import GitOcean
from .raw.github import GitHubOcean
from .raw.gitlab import GitLabOcean
from .raw.gitter import GitterOcean
from .raw.google_hits import GoogleHitsOcean
from .raw.graal import GraalOcean
from .raw.groupsio import GroupsioOcean
Expand Down Expand Up @@ -228,6 +231,7 @@ def get_connectors():
"github": [GitHub, GitHubOcean, GitHubEnrich, GitHubCommand],
"github2": [GitHub, GitHubOcean, GitHubEnrich2, GitHubCommand],
"gitlab": [GitLab, GitLabOcean, GitLabEnrich, GitLabCommand],
"gitter": [Gitter, GitterOcean, GitterEnrich, GitterCommand],
"google_hits": [GoogleHits, GoogleHitsOcean, GoogleHitsEnrich, GoogleHitsCommand],
"groupsio": [Groupsio, GroupsioOcean, GroupsioEnrich, GroupsioCommand],
"hyperkitty": [HyperKitty, HyperKittyOcean, HyperKittyEnrich, HyperKittyCommand],
Expand Down
8 changes: 8 additions & 0 deletions releases/unreleased/add-support-for-gitter.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
title: Add support for Gitter
category: added
author: Nitish Gupta <[email protected]>
issue: 820
notes: Added support for creating raw and enriched
indexes of messages from Gitter. The visualizations
and tests for extracted data have also been added.
40 changes: 40 additions & 0 deletions schema/gitter.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name,type,aggregatable,description
author_bot,boolean,true,"True if the given author is identified as a bot."
author_domain,keyword,true,"Author domain from Email. "
author_gender,keyword,true,"Author gender. "
author_id,keyword,true,"Author Id from SortingHat."
author_multi_org_names,keyword,true,"List of the author organizations from SortingHat profile."
author_name,keyword,true,"Author name."
author_org_name,keyword,true,"Author organization name from SortingHat profile."
author_user_name,keyword,true,"Author username for the platform"
author_uuid,keyword,true,"Author UUID from SortingHat."
fromUser_bot,boolean,true,"True if the given sender is identified as a bot."
fromUser_domain,keyword,true,"Sender domain from Email. "
fromUser_gender,keyword,true,"Sender gender. "
fromUser_id,keyword,true,"Sender Id from SortingHat."
fromUser_multi_org_names,keyword,true,"List of the sender organizations from SortingHat profile."
fromUser_name,keyword,true,"Sender name."
fromUser_org_name,keyword,true,"Sender organization name from SortingHat profile."
fromUser_user_name,keyword,true,"Sender username for the platform"
fromUser_uuid,keyword,true,"Sender UUID from SortingHat."
grimoire_creation_date,date,true,"Message date (when the original author sent the message)."
id,keyword,true,"Gitter message ID. "
is_gitter_message,boolean,true,"True if the item is a Gitter message. "
issues,list,true,"List of issues mentioned in the message, Empty if no issues were mentioned. "
mentioned,list,true,"List of users mentioned in the message, Empty if no user is mentioned. "
metadata__enriched_on,date,true,"Date when the item was enriched."
metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information."
metadata__gelk_version,keyword,true,"Version of the backend used to enrich information."
metadata__timestamp,date,true,"Date when the item was stored in RAW index."
metadata__updated_on,date,true,"Date when the item was updated in its original data source."
origin,keyword,true,"Original URL where the room was retrieved from."
project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)"
project,keyword,true,"Project."
readBy,int,true,"Count of users who have read the message"
repository_labels,keyword,true,"Custom repository labels defined by the user."
tag,keyword,true,"Perceval tag."
text_analyzed,keyword,true,"Message body in plain text. "
tz,int,true,"Hour of the day (0-23) taken from the message's metadata__updated_on timestamp. "
unread,boolean,true,"True if the message is unread by user, False if read. "
url_hostname,list,true,"List of URL hostnames of the URL(s) mentioned in the message."
uuid,keyword,true,"Perceval UUID."
Loading

0 comments on commit fa7ce42

Please sign in to comment.