From abc397e934c027fb0860fe84f7949d21cc3f4393 Mon Sep 17 00:00:00 2001 From: tamajongnc <43474587+tamajongnc@users.noreply.github.com> Date: Tue, 16 Jul 2019 16:54:45 -0400 Subject: [PATCH 1/5] Update perspective.py --- antidox/perspective.py | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/antidox/perspective.py b/antidox/perspective.py index 9ca0897f..654ce696 100644 --- a/antidox/perspective.py +++ b/antidox/perspective.py @@ -7,6 +7,7 @@ import argparse import json import sys +from google.cloud import bigquery from googleapiclient import discovery from googleapiclient import errors as google_api_errors import pandas as pd @@ -75,10 +76,10 @@ def dlp_request(dlp, apikey_data, comment): "name":"PASSPORT" }, { - "name":"PERSON_NAME" + "name":"GCP_CREDENTIALS" }, { - "name":"ALL_BASIC" + "name":"SWIFT_CODE" } ], "minLikelihood":"POSSIBLE", @@ -119,6 +120,22 @@ def contains_toxicity(perspective_response): is_toxic = True return is_toxic +def contains_threat(perspective_response): + """Checks whether a comment's THREAT score is 50 percent or higher.""" + is_threat = False + if (perspective_response['attributeScores']['THREAT']['summaryScore'] + ['value'] >= .5): + is_threat = True + return is_threat + +def contains_insult(perspective_response): + """Checks whether a comment's INSULT score is 50 percent or higher.""" + is_insult = False + if (perspective_response['attributeScores']['INSULT']['summaryScore'] + ['value'] >= .5): + is_insult = True + return is_insult + def get_wikipage(pagename): """ Gets all content from a wikipedia page and turns it into plain text. """ @@ -223,6 +240,7 @@ class GetToxicity(beam.DoFn): # pylint: disable=fixme, inconsistent-return-statements def process(self, element): """Runs every element of collection through perspective and dlp""" + print(repr(element)) print('==============================================\n') if not element: @@ -233,15 +251,23 @@ def process(self, element): perspective_response = perspective_request(perspective, element) has_pii_bool, pii_type = contains_pii(dlp_response) if has_pii_bool: - pii = [element+"\n"+'contains pii?'+"Yes"+"\n"+str(pii_type)+"\n" \ - +"==============================================="+"\n"] + pii = [json.dumps({"comment_text":element, "contains_pii": True, "pii_type":pii_type})+"\n"] return pii if contains_toxicity(perspective_response): - tox = [element+"\n" +"contains TOXICITY?:"+"Yes" - +"\n"+str(perspective_response['attributeScores'] - ['TOXICITY']['summaryScore']['value'])+"\n" - +"=========================================="+"\n"] + tox = [json.dumps({"comment_text":element, "contains_toxicity": True, + "summaryScore":perspective_response['attributeScores'] + ['TOXICITY']['summaryScore']['value']})+"\n"] return tox + if contains_threat(perspective_response): + threat = [json.dumps({"comment_text":element, "contains_threat": True, + "summaryScore":perspective_response['attributeScores'] + ['THREAT']['summaryScore']['value']})+"\n"] + return threat + if contains_insult(perspective_response): + insult = [json.dumps({"comment_text":element, "contains_insult": True, + "summaryScore":perspective_response['attributeScores'] + ['INSULT']['summaryScore']['value']})+"\n"] + return insult except google_api_errors.HttpError as err: print('error', err) results = comments \ From eade63528e370bca7e3927c703298a35b9cbbf49 Mon Sep 17 00:00:00 2001 From: Daniel Borkan Date: Tue, 16 Jul 2019 17:18:22 -0400 Subject: [PATCH 2/5] Reset wikiwatcher.py ---
antidox/wikiwatcher.py | 107 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/antidox/wikiwatcher.py b/antidox/wikiwatcher.py index 44c8f800..b108dc19 100644 --- a/antidox/wikiwatcher.py +++ b/antidox/wikiwatcher.py @@ -4,21 +4,95 @@ from __future__ import division from __future__ import print_function -import argparse import json - +import pprint +import argparse +import pywikibot +import requests import sseclient +from googleapiclient import errors as google_api_errors + +from antidox import clean +from antidox import perspective -def log_event(change): +# pylint: disable=fixme, too-many-locals +def log_event(apikey_data, toxicity, dlp, change): """Logs event by printing. Args: change: a json object with the wikimedia change record. """ - print( - u'user:{user} namespace:{namespace} bot:{bot} comment:{comment} title:{title}' - .format(**change)) + # print( + # u'user:{user} namespace:{namespace} bot:{bot} comment:{comment} title:{title}' + # .format(**change)) + # print('\n########## change:') + from_id = (str(change['revision']['old'])) + to_id = (str(change['revision']['new'])) + page = ('https://en.wikipedia.org/w/api.php?action=compare&fromrev=' + + from_id + '&torev=' + to_id + '&format=json') + get_page = requests.get(page) + response = json.loads(get_page.content.decode('utf-8')) + revision = response['compare']['*'] + + text = clean.content_clean(revision) + + # for line in text: + print(text) + if not text: + return + dlp_response = perspective.dlp_request(dlp, apikey_data, text) + try: + perspective_response = perspective.perspective_request(toxicity, text) + # Perspective can't handle language errors at this time + except google_api_errors.HttpError as err: + print('Error:', err) + return + has_pii_bool, pii_type = perspective.contains_pii(dlp_response) + if has_pii_bool: + header = '==Possible Doxxing Detected: Waiting for review==' + result = ( + u'{' + 'user:{user}, namespace:{namespace}, bot:{bot}, comment:{comment}' + + 'title:{title},'.format(**change) + ', ' + 'comment_text:' + str(text) + + ', ' + 'contains_pii:' + 'True' + ', ' + 'pii_type:' + str(pii_type) + + ', ' + '}' + '\n') + wiki_write(result, header) + + if perspective.contains_toxicity(perspective_response): + header = '==Possibly Toxic Detected: Waiting for review==' + result = ( + u'{' + 'user:{user}, namespace:{namespace}, bot:{bot}, comment:{comment}' + + 'title:{title}'.format(**change) + ', ' + 'comment_text:' + str(text) + + ', ' + 'contains_toxicity:' + 'True' + ', ' + 'toxic_score:' + + str(perspective_response['attributeScores'] + + ['TOXICITY']['summaryScore']['value']) + ', ' + '}' + '\n') + wiki_write(result, header) + + +def wiki_write(result, header): + site = pywikibot.Site() + repo = site.data_repository() + page = pywikibot.Page(site, u'User_talk:DoxDetective') + + heading = (header) + content = (result) + message = '\n\n{}\n{} --~~~~'.format(heading, content) + page.save( + summary='Testing', + watch=None, + minor=False, + botflag=True, + force=False, + asynchronous=False, + callback=None, + apply_cosmetic_changes=None, + appendtext=message) def watcher(event_source, wiki_filter, namespaces_filter, callback): @@ -32,13 +106,22 @@ def watcher(event_source, wiki_filter, namespaces_filter, callback): """ for event in event_source: if event.event == 'message' and event.data: - change = json.loads(event.data) + try: + change = json.loads(event.data) + except json.decoder.JSONDecodeError as err: + print('Error:', err) + pprint.pprint(event.data) +
continue if change['bot']: continue if change['wiki'] != wiki_filter: continue if change['namespace'] not in namespaces_filter: continue + if 'revision' not in change: + continue + if 'old' not in change['revision']: + continue callback(change) @@ -49,7 +132,8 @@ def watcher(event_source, wiki_filter, namespaces_filter, callback): parser.add_argument( '--namespaces', default='1,3', - help='Namespaces defined in http://phabricator.wikimedia.org/source/mediawiki/browse/master/includes/Defines.php separated by commas.' + help='Namespaces defined in http://phabricator.wikimedia.' + + 'org/source/mediawiki/browse/master/includes/Defines.php separated by commas.' ) parser.add_argument( '--url', @@ -60,4 +144,9 @@ def watcher(event_source, wiki_filter, namespaces_filter, callback): namespaces = set([int(ns) for ns in args.namespaces.split(',')]) client = sseclient.SSEClient(args.url) - watcher(client, args.wiki_filter, namespaces, log_event) + apikey_data, toxicity, dlp = perspective.get_client() + + def log_change(change): + return log_event(apikey_data, toxicity, dlp, change) + + watcher(client, args.wiki_filter, namespaces, log_change) From 264f2fe051082fcc769b1a1d8e1b945151c39c86 Mon Sep 17 00:00:00 2001 From: Tamajong Nchukwi Date: Tue, 16 Jul 2019 17:27:31 -0400 Subject: [PATCH 3/5] Set up distibuted system --- antidox/perspective.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/antidox/perspective.py b/antidox/perspective.py index 9ca0897f..61784947 100644 --- a/antidox/perspective.py +++ b/antidox/perspective.py @@ -1,7 +1,6 @@ """ inputs comments to perspective and dlp apis and detects toxicity and personal information> has support for csv files, bigquery tables, and wikipedia talk pages""" -#TODO(tamajongnc): configure pipeline to distribute work to multiple machines # pylint: disable=fixme, import-error # pylint: disable=fixme, unused-import import argparse @@ -11,14 +10,16 @@ from googleapiclient import errors as google_api_errors import pandas as pd import requests -import clean import apache_beam as beam from apache_beam.io.gcp.internal.clients import bigquery from apache_beam.options.pipeline_options import PipelineOptions from apache_beam.options.pipeline_options import SetupOptions from apache_beam.options.pipeline_options import GoogleCloudOptions from apache_beam.options.pipeline_options import StandardOptions +from apache_beam.options.pipeline_options import WorkerOptions from apache_beam import window +import clean + def get_client(): @@ -159,7 +160,7 @@ def main(argv): parser.add_argument('--wiki_pagename', help='insert the talk page name') parser.add_argument('--content', help='specify a column in dataset to retreive data from') parser.add_argument('--output', help='path for output file in cloud bucket') - parser.add_argument('--nd_output' help='gcs path to store ndjson results') + parser.add_argument('--nd_output', help='gcs path to store ndjson results') parser.add_argument('--project', help='project id for bigquery table', \ default='wikidetox-viz') parser.add_argument('--gproject', help='gcp project id') @@ -169,9 +170,11 @@ def main(argv): apikey_data, perspective, dlp = get_client() options = PipelineOptions(pipe_args) gcloud_options = options.view_as(GoogleCloudOptions) - gcloud_options.project = args.gproject - gcloud_options.temp_location = args.temp_location + gcloud_options.project = 'google.com:new-project-242016' + gcloud_options.staging_location = 'gs://tj_cloud_bucket/stage' + gcloud_options.temp_location = 
'gs://tj_cloud_bucket/tmp' options.view_as(StandardOptions).runner = 'dataflow' + options.view_as(WorkerOptions).num_workers = 100 options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=options) as pipeline: if args.wiki_pagename: @@ -213,7 +216,6 @@ def process(self, element): return [json.dumps(data) + '\n'] except google_api_errors.HttpError as err: print('error', err) - print('done') # pylint: disable=fixme, too-few-public-methods class GetToxicity(beam.DoFn): @@ -250,8 +252,8 @@ def process(self, element): | beam.ParDo(NDjson()) # pylint: disable=fixme, expression-not-assigned results | 'WriteToText' >> beam.io.WriteToText( - args.output) + 'gs://tj_cloud_bucket/beam.txt') json_results | 'WriteToText2' >> beam.io.WriteToText( - args.nd_ouput) + 'gs://tj_cloud_bucket/results.json') if __name__ == '__main__': main(sys.argv[1:]) From a6409fedaa10d3b69c6ddcf4d5525a2006d44875 Mon Sep 17 00:00:00 2001 From: Tamajong Nchukwi Date: Tue, 16 Jul 2019 17:31:24 -0400 Subject: [PATCH 4/5] Set up distibuted system --- antidox/perspective_test.py | 158 ++++++++++++++++++++++++------------ 1 file changed, 107 insertions(+), 51 deletions(-) diff --git a/antidox/perspective_test.py b/antidox/perspective_test.py index c93bb5bb..d8e4961e 100644 --- a/antidox/perspective_test.py +++ b/antidox/perspective_test.py @@ -1,54 +1,59 @@ -import perspective -import unittest -from unittest import mock +# -*- coding: utf-8 -*- +from antidox import perspective +import unittest +import sys +if sys.version_info >= (3, 3): + from unittest import mock +else: + import mock class TestPerspective(unittest.TestCase): def test_contains_pii_true(self): dlp_response = \ - { - "result": { - "findings": [ - { - "quote": "footfungusinbellybutton@gmail.com", - "infoType": { - "name": "EMAIL_ADDRESS" - }, - "likelihood": "LIKELY", - "location": { - "byteRange": { - "start": "13", - "end": "46" - }, - "codepointRange": { - "start": "13", - "end": "46" - } - }, - "createTime": "2019-05-31T21:23:12.402Z" - }, - { - "quote": "(206) 555-0123", - "infoType": { - "name": "PHONE_NUMBER" - }, - "likelihood": "LIKELY", - "location": { - "byteRange": { - "start": "67", - "end": "81" - }, - "codepointRange": { - "start": "67", - "end": "81" - } - }, - "createTime": "2019-05-31T21:23:12.402Z" - } - ] - } - } + { + "result": { + "findings": [ + { + "quote": "footfungusinbellybutton@gmail.com", + "infoType": { + "name": "EMAIL_ADDRESS" + }, + "likelihood": "LIKELY", + "location": { + "byteRange": { + "start": "13", + "end": "46" + }, + "codepointRange": { + "start": "13", + "end": "46" + } + }, + "createTime": "2019-05-31T21:23:12.402Z" + }, + { + "quote": "(206) 555-0123", + "infoType": { + "name": "PHONE_NUMBER" + }, + "likelihood": "LIKELY", + "location": { + "byteRange": { + "start": "67", + "end": "81" + }, + "codepointRange": { + "start": "67", + "end": "81" + } + }, + "createTime": "2019-05-31T21:23:12.402Z" + } + ] + } + } has_pii = perspective.contains_pii(dlp_response) self.assertTrue(has_pii) @@ -58,8 +63,7 @@ def test_contains_pii_false(self): "result": {} } has_pii = perspective.contains_pii(dlp_response) - self.assertFalse(has_pii[0]) - self.assertIsNone(has_pii[1]) + self.assertEqual(has_pii, (False, None)) def test_contains_toxicity_true(self): perspective_response = \ @@ -120,7 +124,7 @@ def test_contains_toxicity_true(self): "detectedLanguages": [ "en" ] - } + } is_toxic = perspective.contains_toxicity(perspective_response) self.assertTrue(is_toxic) @@ -184,9 +188,61 @@ def 
test_contains_toxicity_false(self): "en" ] } - is_toxic = perspective.contains_toxicity(perspective_response) + is_toxic = perspective.contains_insult(perspective_response) self.assertFalse(is_toxic) + def test_contains_insult_true(self): + perspective_response = \ + {'attributeScores': {'TOXICITY': {'spanScores': [{'begin': 0, 'end': 25, 'score': {'value': 0.9312127, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.9312127, 'type': 'PROBABILITY'}}, 'THREAT': {'spanScores': [{'begin': 0, 'end': 25, 'score': {'value': 0.15875438, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.15875438, 'type': 'PROBABILITY'}}, 'INSULT': {'spanScores': [{'begin': 0, 'end': 25, 'score': {'value': 0.93682694, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.93682694, 'type': 'PROBABILITY'}}}, 'languages': ['en'], 'detectedLanguages': ['en']} + is_insult = perspective.contains_insult(perspective_response) + self.assertTrue(is_insult) + + def test_contains_threat_true(self): + perspective_response = \ + {'attributeScores': {'INSULT': {'spanScores': [{'begin': 0, 'end': 21, 'score': {'value': 0.55873775, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.55873775, 'type': 'PROBABILITY'}}, 'TOXICITY': {'spanScores': [{'begin': 0, 'end': 21, 'score': {'value': 0.9759337, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.9759337, 'type': 'PROBABILITY'}}, 'THREAT': {'spanScores': [{'begin': 0, 'end': 21, 'score': {'value': 0.9980843, 'type': 'PROBABILITY'}}], 'summaryScore': {'value': 0.9980843, 'type': 'PROBABILITY'}}}, 'languages': ['en'], 'detectedLanguages': ['en']} + is_threat = perspective.contains_threat(perspective_response) + self.assertTrue(is_threat) + def test_get_wikipage(self): + wiki_response = \ + u"""{{talkheader|wp=yes|WT:NYC|WT:WPNYC}} +{{WPBS|1= +{{WikiProject Cities|class=project|importance=na}} +{{WikiProject New York City|class=project|importance=na}} +{{WikiProject New York|class=project|importance=na}} +{{WikiProject United States|class=project|importance=na}} +}} +{{Wikipedia:Wikipedia Signpost/WikiProject used|link=Wikipedia:Wikipedia Signpost/2012-12-31/WikiProject report|writer= [[User:Mabeenot|Mabeenot]]| ||day =31|month=December|year=2012}} +{{auto archiving notice|bot=MiszaBot II|botlink=User:MiszaBot II|age=60}}{{User:MiszaBot/config +|archiveheader = {{talkarchivenav}} +|maxarchivesize = 100K +|counter = 7 +|minthreadsleft = 5 +|minthreadstoarchive = 1 +|algo = old(60d) +|archive = Wikipedia talk:WikiProject New York City/Archive %(counter)d +}}{{User:HBC Archive Indexerbot/OptIn|target=Wikipedia talk:WikiProject New York City/Archive index|mask=Wikipedia talk:WikiProject New York City/Archive <#>|leading_zeros=0|indexhere=no}} + +{{TOC right}} + +== Help with a park article? == + +Hi! I didn't know if anyone was willing to work on a park article or not - a student of mine created the article on [[St. James Park (Bronx)]]. The class ends this week and I'm not entirely sure if they will be back on to edit it, but at the present it lacks information and sourcing to establish how it's notable. I'm going to try to do as much as I can for it, but I'm admittedly kind of swamped with other classes so I wanted to see if anyone would be interested in this. 15:36, 7 May 2019 (UTC) + +== Wikipedia:Naming conventions (US stations)/NYC Subway RfC == + +Just so everyone who would come here knows, there is an ongoing RfC at [[Wikipedia:Naming conventions (US stations)/NYC Subway RfC]] that WP:NYC might be interested in.
{{sbb}} -- [[User:I dream of horses|I dream of horses]] {{small| If you reply here, please [[WP:ECHO|ping me]] by adding {{U|I dream of horses}} to your message }} {{small|([[User talk:I dream of horses|talk to me]]) ([[Special:Contributions/I dream of horses|My edits]])}} @ 05:11, 12 June 2019 (UTC)""" + clean_text = \ + u"""Help with a park article? + +Hi! I didn't know if anyone was willing to work on a park article or not - a student of mine created the article on St. James Park (Bronx). The class ends this week and I'm not entirely sure if they will be back on to edit it, but at the present it lacks information and sourcing to establish how it's notable. I'm going to try to do as much as I can for it, but I'm admittedly kind of swamped with other classes so I wanted to see if anyone would be interested in this. + + Wikipedia:Naming conventions (US stations)/NYC Subway RfC + +Just so everyone who would come here knows, there is an ongoing RfC at Wikipedia:Naming conventions (US stations)/NYC Subway RfC that WP:NYC might be interested in.    @ """ + text = perspective.wiki_clean(wiki_response) + self.assertEqual(text.strip(), clean_text.strip()) + + class Test_BigQuery(unittest.TestCase): def test_use_query(self): @@ -203,5 +259,5 @@ def test_use_query(self): self.assertEqual(len(rows), len(fake_response_comments)) -if __name__ == "__main__": - unittest.main() +if __name__ == '__main__': + unittest.main() From 5e5b2f49e5da1f3fc0486c7906b2af55267e627d Mon Sep 17 00:00:00 2001 From: Tamajong Nchukwi Date: Tue, 16 Jul 2019 17:45:07 -0400 Subject: [PATCH 5/5] updated README with instructions for running on a distributed system --- antidox/README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/antidox/README.md b/antidox/README.md index d113c8fb..edb40c3b 100644 --- a/antidox/README.md +++ b/antidox/README.md @@ -23,10 +23,13 @@ will be tested in wikipedia chat rooms as a staring point. bazel run :perspective --input_file=$PWD/example.csv --api_key=$PWD/api_key.json ``` - +5. To run the pipeline on a distributed system, pass Beam's --setup_file flag so the antidox package is installed on the worker machines: + ```shell + --setup_file ./setup.py + ``` Run the given model that test the comment from the csv file for toxicity and personally identifiable information. -5. Run unittest to ensure the functions contains_toxicity(), and contains_pii(), are working properly. +6. Run unittest to ensure the functions contains_toxicity(), and contains_pii(), are working properly. ```shell bazel test :perspective_test --test_output=all ```
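For reviewers tracing how the pieces fit: the Dataflow settings hard-coded in PATCH 3 and the --setup_file flag documented in the README by PATCH 5 can also be expressed through Beam's option classes. The snippet below is a minimal sketch, not part of the patches themselves; the project ID, bucket paths, and setup.py location are placeholders standing in for the values a particular checkout and GCP project would use.

```python
# Sketch only: the programmatic equivalent of the pipeline flags used above.
# 'my-gcp-project', 'gs://my-bucket/stage', 'gs://my-bucket/tmp', and './setup.py'
# are placeholders, not the project's real settings.
import apache_beam as beam
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    SetupOptions,
    StandardOptions,
    WorkerOptions,
)

options = PipelineOptions()
options.view_as(GoogleCloudOptions).project = 'my-gcp-project'
options.view_as(GoogleCloudOptions).staging_location = 'gs://my-bucket/stage'
options.view_as(GoogleCloudOptions).temp_location = 'gs://my-bucket/tmp'
options.view_as(StandardOptions).runner = 'DataflowRunner'
# Fan the work out across many worker machines.
options.view_as(WorkerOptions).num_workers = 100
# Ship the local package to the workers so `from antidox import ...` resolves
# remotely; this is the programmatic form of the README's --setup_file flag.
options.view_as(SetupOptions).setup_file = './setup.py'
options.view_as(SetupOptions).save_main_session = True

with beam.Pipeline(options=options) as pipeline:
    pass  # build the read / ParDo / write transforms here, as in perspective.main()
```

Passing these options to beam.Pipeline has the same effect as supplying --project, --staging_location, --temp_location, --num_workers, and --setup_file on the command line; --setup_file is what makes the antidox package importable on the Dataflow workers.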