This commit adds support for the Gitter backend.

Raw and Enriched indexes have been added along with their tests and schemas. Signed-off-by: Nitish Gupta <[email protected]>
chaoss · Apr 16, 2020 · fa7ce42 · fa7ce42
1 parent c6a65f3
commit fa7ce42
Show file tree

Hide file tree

Showing 9 changed files with 829 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -49,6 +49,7 @@ Each enriched index includes one or more types of documents, which are summarize
 - **GitHub repo statistics**: each document includes repo statistics (e.g., forks, watchers).
 - **GitLab issues**: each document corresponds to an issue.
 - **GitLab merge requests**: each document corresponds to a merge request.
+- **Gitter**: each document corresponds to a message.
 - **Googlehits**: each document contains hits information derived from Google.
 - **Groupsio**: each document corresponds to a message.
 - **Hyperkitty**: each document corresponds to a message.

diff --git a/grimoire_elk/enriched/gitter.py b/grimoire_elk/enriched/gitter.py
@@ -0,0 +1,184 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2015-2020 Bitergia
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Authors:
+#   Nitish Gupta <[email protected]>
+#
+
+import logging
+import re
+from urllib.parse import urlparse
+
+from grimoirelab_toolkit.datetime import str_to_datetime
+
+from .enrich import Enrich, metadata
+from ..elastic_mapping import Mapping as BaseMapping
+
+
+logger = logging.getLogger(__name__)
+
+
+class Mapping(BaseMapping):
+
+    @staticmethod
+    def get_elastic_mappings(es_major):
+        """Get Elasticsearch mapping.
+
+        :param es_major: major version of Elasticsearch, as string
+        :returns:        dictionary with a key, 'items', with the mapping
+        """
+
+        mapping = """
+        {
+            "properties": {
+                "text_analyzed": {
+                  "type": "text",
+                  "fielddata": true,
+                  "index": true
+                }
+           }
+        } """
+
+        return {"items": mapping}
+
+
+class GitterEnrich(Enrich):
+
+    mapping = Mapping
+
+    # REGEX to extract links from HTML text
+    HTML_LINK_REGEX = re.compile("href=[\"\'](.*?)[\"\']")
+
+    def __init__(self, db_sortinghat=None, db_projects_map=None, json_projects_map=None,
+                 db_user='', db_password='', db_host=''):
+        super().__init__(db_sortinghat, db_projects_map, json_projects_map,
+                         db_user, db_password, db_host)
+
+    def get_field_author(self):
+        return "fromUser"
+
+    def get_sh_identity(self, item, identity_field=None):
+        # email not available for gitter
+        identity = {
+            'username': None,
+            'name': None,
+            'email': None
+        }
+
+        if self.get_field_author() not in item['data']:
+            return identity
+        from_ = item['data'][self.get_field_author()]
+
+        identity['username'] = from_.get('username', None)
+        identity['name'] = from_.get('displayName', None)
+
+        return identity
+
+    def get_identities(self, item):
+        """ Return the identities from an item """
+
+        identity = self.get_sh_identity(item)
+        yield identity
+
+    def get_project_repository(self, eitem):
+        tokens = eitem['origin'].rsplit("/", 1)
+        return tokens[0] + " " + tokens[1]
+
+    @metadata
+    def get_rich_item(self, item):
+
+        eitem = {}
+
+        for f in self.RAW_FIELDS_COPY:
+            if f in item:
+                eitem[f] = item[f]
+            else:
+                eitem[f] = None
+
+        message = item['data']
+
+        eitem['unread'] = 1 if message['unread'] else 0
+        eitem['text_analyzed'] = message['text']
+
+        copy_fields = ["readBy", "issues", "id"]
+
+        for f in copy_fields:
+            if f in message:
+                eitem[f] = message[f]
+            else:
+                eitem[f] = None
+
+        eitem.update(self.get_rich_links(item['data']))
+
+        message_timestamp = str_to_datetime(eitem['metadata__updated_on'])
+        eitem['tz'] = int(message_timestamp.strftime("%H"))
+
+        if self.sortinghat:
+            eitem.update(self.get_item_sh(item))
+
+        if self.prjs_map:
+            eitem.update(self.get_item_project(eitem))
+
+        eitem.update(self.get_grimoire_fields(item["metadata__updated_on"], "message"))
+
+        self.add_repository_labels(eitem)
+        self.add_metadata_filter_raw(eitem)
+        return eitem
+
+    def get_rich_links(self, item):
+
+        rich_item = {}
+
+        if item['issues']:
+            self.extract_issues(item['issues'], item['html'])
+
+        if item['mentions']:
+            rich_item['mentioned'] = self.extract_mentions(item['mentions'])
+
+        rich_item['url_hostname'] = []
+
+        if item['urls']:
+            for url in item['urls']:
+                url_parsed = urlparse(url['url'])
+                rich_item['url_hostname'].append('{uri.scheme}://{uri.netloc}/'.format(uri=url_parsed))
+
+        return rich_item
+
+    def extract_issues(self, issue_pr, html_text):
+        """Enrich issues or PRs mentioned in the message"""
+
+        links_found = self.HTML_LINK_REGEX.findall(html_text)
+        for i, entity in enumerate(issue_pr):
+            if 'repo' in entity.keys() and links_found:
+                if links_found[i].split('/')[-2] == 'issues':
+                    entity['is_issue'] = entity['repo'] + ' #' + entity['number']
+                elif links_found[i].split('/')[-2] == 'pull':
+                    entity['is_pull'] = entity['repo'] + ' #' + entity['number']
+                else:
+                    continue
+                entity['url'] = links_found[i]
+
+    def extract_mentions(self, mentioned):
+        """Enrich users mentioned in the message"""
+
+        rich_mentions = []
+
+        for usr in mentioned:
+            if 'userId' in usr.keys():
+                rich_mentions.append({'mentioned_username': usr['screenName'], 'mentioned_userId': usr['userId']})
+
+        return rich_mentions
diff --git a/grimoire_elk/raw/gitter.py b/grimoire_elk/raw/gitter.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2015-2020 Bitergia
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Authors:
+#   Nitish Gupta <[email protected]>
+#
+
+from .elastic import ElasticOcean
+from ..elastic_mapping import Mapping as BaseMapping
+
+
+class Mapping(BaseMapping):
+
+    @staticmethod
+    def get_elastic_mappings(es_major):
+        """Get Elasticsearch mapping.
+
+        :param es_major: major version of Elasticsearch, as string
+        :returns:        dictionary with a key, 'items', with the mapping
+        """
+
+        mapping = '''
+             {
+                "dynamic":true,
+                "properties": {
+                    "data": {
+                        "dynamic":false,
+                        "properties": {}
+                    }
+                }
+            }
+            '''
+
+        return {"items": mapping}
+
+
+class GitterOcean(ElasticOcean):
+    """Gitter Ocean feeder"""
+
+    mapping = Mapping
+
+    @classmethod
+    def get_perceval_params_from_url(cls, url):
+        """ Get the perceval params given a URL for the data source """
+
+        params = []
+
+        org = url.split('/')[-2]
+        room = url.split('/')[-1]
+        params.append(org)
+        params.append(room)
+        return params
diff --git a/grimoire_elk/utils.py b/grimoire_elk/utils.py
@@ -46,6 +46,7 @@
 from perceval.backends.core.git import Git, GitCommand
 from perceval.backends.core.github import GitHub, GitHubCommand
 from perceval.backends.core.gitlab import GitLab, GitLabCommand
+from perceval.backends.core.gitter import Gitter, GitterCommand
 from perceval.backends.core.googlehits import GoogleHits, GoogleHitsCommand
 from perceval.backends.core.groupsio import Groupsio, GroupsioCommand
 from perceval.backends.core.hyperkitty import HyperKitty, HyperKittyCommand
@@ -91,6 +92,7 @@
 from .enriched.github import GitHubEnrich
 from .enriched.github2 import GitHubEnrich2
 from .enriched.gitlab import GitLabEnrich
+from .enriched.gitter import GitterEnrich
 from .enriched.google_hits import GoogleHitsEnrich
 from .enriched.groupsio import GroupsioEnrich
 from .enriched.hyperkitty import HyperKittyEnrich
@@ -130,6 +132,7 @@
 from .raw.git import GitOcean
 from .raw.github import GitHubOcean
 from .raw.gitlab import GitLabOcean
+from .raw.gitter import GitterOcean
 from .raw.google_hits import GoogleHitsOcean
 from .raw.graal import GraalOcean
 from .raw.groupsio import GroupsioOcean
@@ -228,6 +231,7 @@ def get_connectors():
             "github": [GitHub, GitHubOcean, GitHubEnrich, GitHubCommand],
             "github2": [GitHub, GitHubOcean, GitHubEnrich2, GitHubCommand],
             "gitlab": [GitLab, GitLabOcean, GitLabEnrich, GitLabCommand],
+            "gitter": [Gitter, GitterOcean, GitterEnrich, GitterCommand],
             "google_hits": [GoogleHits, GoogleHitsOcean, GoogleHitsEnrich, GoogleHitsCommand],
             "groupsio": [Groupsio, GroupsioOcean, GroupsioEnrich, GroupsioCommand],
             "hyperkitty": [HyperKitty, HyperKittyOcean, HyperKittyEnrich, HyperKittyCommand],

diff --git a/releases/unreleased/add-support-for-gitter.yml b/releases/unreleased/add-support-for-gitter.yml
@@ -0,0 +1,8 @@
+---
+title: Add support for Gitter
+category: added
+author: Nitish Gupta <[email protected]>
+issue: 820
+notes: Added support for creating raw and enriched
+       indexes of messages from Gitter. The visualizations
+       and tests for the extracted data have also been added.
diff --git a/schema/gitter.csv b/schema/gitter.csv
@@ -0,0 +1,40 @@
+name,type,aggregatable,description
+author_bot,boolean,true,"True if the given author is identified as a bot."
+author_domain,keyword,true,"Author domain from Email. "
+author_gender,keyword,true,"Author gender. "
+author_id,keyword,true,"Author Id from SortingHat."
+author_multi_org_names,keyword,true,"List of the author organizations from SortingHat profile."
+author_name,keyword,true,"Author name."
+author_org_name,keyword,true,"Author organization name from SortingHat profile."
+author_user_name,keyword,true,"Author username for the platform"
+author_uuid,keyword,true,"Author UUID from SortingHat."
+fromUser_bot,boolean,true,"True if the given sender is identified as a bot."
+fromUser_domain,keyword,true,"Sender domain from Email. "
+fromUser_gender,keyword,true,"Sender gender. "
+fromUser_id,keyword,true,"Sender Id from SortingHat."
+fromUser_multi_org_names,keyword,true,"List of the sender organizations from SortingHat profile."
+fromUser_name,keyword,true,"Sender name."
+fromUser_org_name,keyword,true,"Sender organization name from SortingHat profile."
+fromUser_user_name,keyword,true,"Sender username for the platform"
+fromUser_uuid,keyword,true,"Sender UUID from SortingHat."
+grimoire_creation_date,date,true,"Message date (when the original author sent the message)."
+id,keyword,true,"Gitter message ID. "
+is_gitter_message,boolean,true,"True if the item is a Gitter message. "
+issues,list,true,"List of issues mentioned in the message, Empty if no issues were mentioned. "
+mentioned,list,true,"List of users mentioned in the message, Empty if no user is mentioned. "
+metadata__enriched_on,date,true,"Date when the item was enriched."
+metadata__gelk_backend_name,keyword,true,"Name of the backend used to enrich information."
+metadata__gelk_version,keyword,true,"Version of the backend used to enrich information."
+metadata__timestamp,date,true,"Date when the item was stored in RAW index."
+metadata__updated_on,date,true,"Date when the item was updated in its original data source."
+origin,keyword,true,"Original URL where the room was retrieved from."
+project_1,keyword,true,"Project (if more than one level is allowed in project hierarchy)"
+project,keyword,true,"Project."
+readBy,int,true,"Count of users who have read the message"
+repository_labels,keyword,true,"Custom repository labels defined by the user."
+tag,keyword,true,"Perceval tag."
+text_analyzed,keyword,true,"Message body in plain text. "
+tz,int,true,"Hour of the day (0-23) the message was sent. "
+unread,boolean,true,"True if the message is unread by user, False if read. "
+url_hostname,list,true,"List of URL hostnames of the URL(s) mentioned in the message."
+uuid,keyword,true,"Perceval UUID."