From 3afc3cc3a65756e8e11d57a90e01507c253861e1 Mon Sep 17 00:00:00 2001 From: yanliang Date: Sat, 1 Jul 2017 01:13:41 -0500 Subject: [PATCH] #180 make the selenium stuff woring out of 30 we get 16 actors, we can get much more --- .../hack_wiki_asus-checkpoint.ipynb | 913 ++++++++++++++++++ .../geckodriver.log | 162 ++++ .../hack_wiki.ipynb | 25 +- .../hack_wiki_asus.ipynb | 913 ++++++++++++++++++ 4 files changed, 2007 insertions(+), 6 deletions(-) create mode 100644 otherHelperCode/english_to_arabic_dictionary/.ipynb_checkpoints/hack_wiki_asus-checkpoint.ipynb create mode 100644 otherHelperCode/english_to_arabic_dictionary/hack_wiki_asus.ipynb diff --git a/otherHelperCode/english_to_arabic_dictionary/.ipynb_checkpoints/hack_wiki_asus-checkpoint.ipynb b/otherHelperCode/english_to_arabic_dictionary/.ipynb_checkpoints/hack_wiki_asus-checkpoint.ipynb new file mode 100644 index 0000000..7e35ed2 --- /dev/null +++ b/otherHelperCode/english_to_arabic_dictionary/.ipynb_checkpoints/hack_wiki_asus-checkpoint.ipynb @@ -0,0 +1,913 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import json\n", + "import pickle\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from selenium.common.exceptions import TimeoutException\n", + "from selenium.webdriver.firefox.firefox_binary import FirefoxBinary\n", + "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "binary = FirefoxBinary(r'/usr/bin/firefox')\n", + "caps = DesiredCapabilities.FIREFOX.copy()\n", + "#Set ‘marionette’ browser to True\n", + "caps['marionette'] = True\n", + "#Launch the Firefox instance by specifying the geckodriver executable path\n", + "driver = webdriver.Firefox(firefox_binary=binary,capabilities=caps)\n", + "driver.wait = WebDriverWait(driver, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 242, + "metadata": {}, + "outputs": [], + "source": [ + "def formatOriginalNameToWikiName(originalname):\n", + " \"\"\"\n", + " return goodname if we return a better format from wiki\n", + " if not we just return empty string\n", + " we need this function,since if not formated yet, most of the time when you search wiki with the bad \n", + " name in the url it will return nothing.\n", + " \"\"\"\n", + " wikipedia.set_lang(\"en\")\n", + " allWikiResults=wikipedia.search(originalname)\n", + " if(len(allWikiResults)==0):\n", + " return \"\"\n", + " else:\n", + " return str(wikipedia.search(originalname)[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 294, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def MakeSeleniumToSearchWithOriginalName(originalname):\n", + " wikiname=formatOriginalNameToWikiName(originalname)\n", + " nametosearch=originalname if(wikiname==\"\") else wikiname\n", + " driver.get(\"https://en.wikipedia.org/wiki/\"+str(nametosearch))\n", + " result={}\n", + " result[\"findresult\"]={}\n", + " result[\"nofind\"]={}\n", + " try:\n", + " elem = driver.find_element_by_css_selector(\".interwiki-ar a\")\n", + " \"\"\"\n", + " these two lines of code needs to run before elem.click(), since it will goto\n", + " another page never find it any more.\n", + " \"\"\"\n", + " tempdic={}\n", + " #print(\"me\"+str(elem.get_attribute(\"href\")))\n", + " tempdic[\"arurl\"]=str(elem.get_attribute(\"href\"))\n", + " elem.click()\n", + " tempdic[\"originalname\"]=originalname\n", + " tempdic[\"wikiname\"]=wikiname\n", + " firstheading=driver.find_element_by_id(\"firstHeading\")\n", + " #arabic is from left to right that why u need to get the first one that returns.\n", + " tempdic[\"arname\"]=firstheading.text.split(\"\\n\")[0]\n", + " #print(tempdic[\"arname\"])\n", + " result[\"findresult\"]=tempdic\n", + " except Exception as e:\n", + " #print(e)\n", + " tempno={}\n", + " tempno[\"originalname\"]=originalname\n", + " result[\"nofind\"]=tempno\n", + " pass\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 253, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "hey=MakeSeleniumToSearchWithOriginalName(\"Mohammed Zahir Shah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "metadata": {}, + "outputs": [], + "source": [ + "nametosearch=\"Mohammed Zahir Shah\"\n", + "hey=driver.get(\"https://en.wikipedia.org/wiki/\"+str(nametosearch))" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "metadata": {}, + "outputs": [], + "source": [ + "elem=driver.find_element_by_css_selector(\".interwiki-ar a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "elem.click()" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "firstheading=driver.find_element_by_id(\"firstHeading\")" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'محمد ظاهر شاه'" + ] + }, + "execution_count": 262, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "firstheading.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def clean_line(line):\n", + " # Take out extra space, underscores, comments, etc.\n", + " cleaned = re.sub(\"_* .+\", \"\", line).strip()\n", + " cleaned = re.sub(\"_$\", \"\", cleaned, flags=re.MULTILINE)\n", + " return cleaned\n", + "\n", + "def ingest_dictionary(dict_path):\n", + " \"\"\"\n", + " Read in the country (or other) actor dictionaries.\n", + " \"\"\"\n", + " with open(dict_path) as f:\n", + " country_file = f.read()\n", + " split_file = country_file.split(\"\\n\")\n", + " \n", + " dict_dict = []\n", + " key_name = \"\"\n", + " alt_names = [] \n", + " roles = []\n", + "\n", + " for line in split_file:\n", + " if not line:\n", + " pass\n", + " elif line[0] == \"#\":\n", + " pass\n", + " elif re.match(\"[A-Z]\", line[0]):\n", + " # handle the previous\n", + " entry = {\"actor_en\" : key_name,\n", + " \"alt_names_en\" : alt_names,\n", + " \"roles\" : roles}\n", + " dict_dict.append(entry)\n", + " # zero everything out\n", + " alt_names = []\n", + " roles = []\n", + " # make new key name\n", + " key_name = clean_line(line)\n", + " # check to see if the role is built in\n", + " if bool(re.search(\"\\[[A-Z]{3}\\]\", line)):\n", + " roles = re.findall(\"\\[(.+?)\\]\", line)\n", + " elif line[0] == \"+\":\n", + " cleaned = clean_line(line[1:])\n", + " alt_names.append(cleaned)\n", + " elif re.match(\"\\s\", line):\n", + " roles.append(line.strip())\n", + " return dict_dict \n", + "dp = \"./Phoenix.Countries.actors.txt\"\n", + "dict_dict = ingest_dictionary(dp)" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18390" + ] + }, + "execution_count": 233, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dict_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'actor_en': 'AHMAD_CHALABI',\n", + " 'alt_names_en': [],\n", + " 'roles': ['[IRQELI 620101-030901]',\n", + " '[IRQGOV 030901-030930]',\n", + " '[IRQGOV 031101-040630]',\n", + " '[IRQGOV 050601-060531]',\n", + " '[IRQELI]']}" + ] + }, + "execution_count": 237, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_dict[7777]" + ] + }, + { + "cell_type": "code", + "execution_count": 292, + "metadata": {}, + "outputs": [], + "source": [ + "def buildMultiLanguageActorDictionary(dict_dict):\n", + " finalResult={}\n", + " finalResult[\"goodones\"]=[]\n", + " finalResult[\"badones\"]=[]\n", + " for item in dict_dict:\n", + " originalname=item[\"actor_en\"]\n", + " if(originalname!=\"\"):\n", + " temp=MakeSeleniumToSearchWithOriginalName(originalname)\n", + " if(temp[\"findresult\"]):\n", + " finalResult[\"goodones\"].append(temp[\"findresult\"])\n", + " else:\n", + " finalResult[\"badones\"].append(temp[\"nofind\"])\n", + " return finalResult\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 296, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'badones': [{'originalname': 'NUR_MOHAMMAD_TARAKI'},\n", + " {'originalname': 'HAJI_MOHAMMAD_CHAMKANI'},\n", + " {'originalname': 'ABDUL_RAHIM_HATEF'},\n", + " {'originalname': 'MULLAH_MOHAMMAD_RABBANI'},\n", + " {'originalname': 'MAWLAWI_ABDUL_KABIR'},\n", + " {'originalname': 'MOHAMMAD_NUR_AHMAD_ETEMADI'},\n", + " {'originalname': 'MOHAMMAD_MUSA_SHAFIQ'},\n", + " {'originalname': 'SULTAN_ALI_KESHTMAND'},\n", + " {'originalname': 'MOHAMMAD_HASSAN_SHARQ'},\n", + " {'originalname': 'FAZAL_HAQ_KHALIQYAR'},\n", + " {'originalname': 'ABDUL_SABUR_FARID_KUHESTANI'},\n", + " {'originalname': 'ARSALA_RAHMANI'},\n", + " {'originalname': 'AHMAD_SHAH_AHMADZAI'},\n", + " {'originalname': 'AHMED_ZIA_MASSOUD'},\n", + " {'originalname': 'ABDUL_RAHIM_GHAFOORZAI'}],\n", + " 'goodones': [{'arname': 'أفغانستان',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A3%D9%81%D8%BA%D8%A7%D9%86%D8%B3%D8%AA%D8%A7%D9%86',\n", + " 'originalname': 'AFGHANISTAN',\n", + " 'wikiname': 'Afghanistan'},\n", + " {'arname': 'قوات الأمن الوطنية الأفغانية',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%82%D9%88%D8%A7%D8%AA_%D8%A7%D9%84%D8%A3%D9%85%D9%86_%D8%A7%D9%84%D9%88%D8%B7%D9%86%D9%8A%D8%A9_%D8%A7%D9%84%D8%A3%D9%81%D8%BA%D8%A7%D9%86%D9%8A%D8%A9',\n", + " 'originalname': 'AFGHAN_NATIONAL_SECURITY_FORCES',\n", + " 'wikiname': 'Afghan National Security Forces'},\n", + " {'arname': 'محمد ظاهر شاه',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D8%B8%D8%A7%D9%87%D8%B1_%D8%B4%D8%A7%D9%87',\n", + " 'originalname': 'MOHAMMAD_ZAHIR_SHAH',\n", + " 'wikiname': 'Mohammed Zahir Shah'},\n", + " {'arname': 'عبد القادر',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B9%D8%A8%D8%AF_%D8%A7%D9%84%D9%82%D8%A7%D8%AF%D8%B1',\n", + " 'originalname': 'ABDUL_QADIR',\n", + " 'wikiname': 'Abdul Qadir'},\n", + " {'arname': 'حفيظ الله أمين',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%AD%D9%81%D9%8A%D8%B8_%D8%A7%D9%84%D9%84%D9%87_%D8%A3%D9%85%D9%8A%D9%86',\n", + " 'originalname': 'HAFIZULLAH_AMIN',\n", + " 'wikiname': 'Hafizullah Amin'},\n", + " {'arname': 'بابراك كرمال',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D8%A8%D8%B1%D8%A7%D9%83_%D9%83%D8%B1%D9%85%D8%A7%D9%84',\n", + " 'originalname': 'BABRAK_KARMAL',\n", + " 'wikiname': 'Babrak Karmal'},\n", + " {'arname': 'محمد نجيب الله',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D9%86%D8%AC%D9%8A%D8%A8_%D8%A7%D9%84%D9%84%D9%87',\n", + " 'originalname': 'MOHAMMAD_NAJIBULLAH',\n", + " 'wikiname': 'Mohammad Najibullah'},\n", + " {'arname': 'محمد داود خان',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D8%AF%D8%A7%D9%88%D8%AF_%D8%AE%D8%A7%D9%86',\n", + " 'originalname': 'SARDAR_MOHAMMAD_DAUD_KHAN',\n", + " 'wikiname': 'Mohammed Daoud Khan'},\n", + " {'arname': 'صبغت الله مجددي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B5%D8%A8%D8%BA%D8%AA_%D8%A7%D9%84%D9%84%D9%87_%D9%85%D8%AC%D8%AF%D8%AF%D9%8A',\n", + " 'originalname': 'SIBGHATULLAH_MOJADEDI',\n", + " 'wikiname': 'Sibghatullah Mojaddedi'},\n", + " {'arname': 'برهان الدين رباني',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A8%D8%B1%D9%87%D8%A7%D9%86_%D8%A7%D9%84%D8%AF%D9%8A%D9%86_%D8%B1%D8%A8%D8%A7%D9%86%D9%8A',\n", + " 'originalname': 'BURHANUDDIN_RABBANI',\n", + " 'wikiname': 'Burhanuddin Rabbani'},\n", + " {'arname': 'حامد كرزاي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%AD%D8%A7%D9%85%D8%AF_%D9%83%D8%B1%D8%B2%D8%A7%D9%8A',\n", + " 'originalname': 'HAMID_KARZAI',\n", + " 'wikiname': 'Hamid Karzai'},\n", + " {'arname': 'عبد الظاهر',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B9%D8%A8%D8%AF_%D8%A7%D9%84%D8%B8%D8%A7%D9%87%D8%B1',\n", + " 'originalname': 'ABDUL_ZAHIR',\n", + " 'wikiname': 'Abdul Zahir'},\n", + " {'arname': 'غلبدين حكمتيار',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%BA%D9%84%D8%A8%D8%AF%D9%8A%D9%86_%D8%AD%D9%83%D9%85%D8%AA%D9%8A%D8%A7%D8%B1',\n", + " 'originalname': 'GULBUDDIN_HEKMATYAR',\n", + " 'wikiname': 'Gulbuddin Hekmatyar'},\n", + " {'arname': 'طارق معروفي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B7%D8%A7%D8%B1%D9%82_%D9%85%D8%B9%D8%B1%D9%88%D9%81%D9%8A',\n", + " 'originalname': 'AHMED_SHAH_MASSOUD',\n", + " 'wikiname': 'Tarek Maaroufi'}]}" + ] + }, + "execution_count": 296, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buildMultiLanguageActorDictionary(dict_dict[0:30])" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Mohammed Zahir Shah',\n", + " 'Mohammadzai',\n", + " '1988 in Afghanistan',\n", + " 'Sardar Shah Wali Khan',\n", + " 'Bagrami District',\n", + " 'Qiamuddin Khadim',\n", + " 'Judiciary of Afghanistan',\n", + " 'Laili Helms',\n", + " 'United National Front (Afghanistan)',\n", + " 'Barakzai dynasty']" + ] + }, + "execution_count": 252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"MOHAMMAD_ZAHIR_SHAH\")" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [], + "source": [ + "test=len(wikipedia.search(\"obama\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Barack Obama'" + ] + }, + "execution_count": 212, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"obama\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [], + "source": [ + "MakeSeleniumToSearchWithWikiFormattedName(\"Mohammad Najibullah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.get(\"https://en.wikipedia.org/wiki/\"+\"Mohammad Najibullah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "metadata": {}, + "outputs": [], + "source": [ + "elem = driver.find_element_by_css_selector(\".interwiki-ar a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D9%86%D8%AC%D9%8A%D8%A8_%D8%A7%D9%84%D9%84%D9%87'" + ] + }, + "execution_count": 228, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elem.get_attribute(\"href\")" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [], + "source": [ + "elem.click()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [], + "source": [ + "test=driver.find_element_by_id(\"firstHeading\")\n", + "#arabic is from left to right that why u need to get the first one that returns.\n", + "test.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'محمد نجيب الله'" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#arabic is from left to right that why u need to get the first one that returns.\n", + "test.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.get('http://www.google.com')\n", + "\n", + "# search = driver.find_element_by_name('q')\n", + "# search.send_keys(\"selenium\")\n", + "# search.send_keys(Keys.RETURN) # hit return after you enter search text\n", + "from selenium.webdriver.common.keys import Keys\n", + "box = driver.wait.until(EC.presence_of_element_located(\n", + "(By.NAME, \"q\")))\n", + "button = driver.wait.until(EC.element_to_be_clickable(\n", + "(By.NAME, \"btnK\")))\n", + "box.send_keys(\"SIBGHATULLAH_MOJADEDI\")\n", + "box.send_keys(Keys.RETURN)\n", + "#time.sleep(5)\n", + "html=driver.page_source\n", + "soup=BeautifulSoup(html,\"html.parser\")\n", + "\n", + "#time.sleep(5) # sleep for 5 seconds so you can see the results\n", + "#driver.quit()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "hi=soup.find(\"div\",{\"class\":\"g\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.quit()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "comments = soup.findAll('div',{'class':'g'}) " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen # Python 3\n", + "# from urllib2 import urlopen # Python 2\n", + "\n", + "url = \"https://en.wikipedia.org/wiki/\"+\"obama\"\n", + "soup = BeautifulSoup(urlopen(url), \"html.parser\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from googleapiclient.discovery import build\n", + "import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'https://en.wikipedia.org/wiki/Sibghatullah_Mojaddedi'\n" + ] + } + ], + "source": [ + "from googleapiclient.discovery import build\n", + "import pprint\n", + "\n", + "my_api_key = \"AIzaSyBBulleVoiDN9i8NITQqH_BUNGgyWX-nmA\"\n", + "my_cse_id = \"003461024781403571159:p4qrcenq1l0\"\n", + "\n", + "def google_search(search_term, api_key, cse_id, **kwargs):\n", + " service = build(\"customsearch\", \"v1\", developerKey=api_key)\n", + " res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()\n", + " #return res['spelling']['correctedQuery']\n", + "# return res['item']\n", + " return res['items']\n", + "results = google_search(\n", + " 'SIBGHATULLAH_MOJADEDI', my_api_key, my_cse_id, num=1)\n", + "for result in results:\n", + " pprint.pprint(result['formattedUrl'])\n", + "#print(results)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ألبرت أينشتاين\n" + ] + } + ], + "source": [ + "print(\"\\u0623\\u0644\\u0628\\u0631\\u062a \\u0623\\u064a\\u0646\\u0634\\u062a\\u0627\\u064a\\u0646\")" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "payload = {'action': 'query', 'titles': 'Alert Einstein','prop':'langlinks','format':'json'}\n", + "\n", + "r = requests.get(\"https://en.wikipedia.org/w/api.php\", data=payload)\n", + "soup=BeautifulSoup(r.content,\"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wikipedia ( or WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles. Wikipedia is the largest and most popular general reference work on the Internet and is ranked among the ten most popular websites. Wikipedia is owned by the nonprofit Wikimedia Foundation.\n", + "Wikipedia was launched on January 15, 2001, by Jimmy Wales and Larry Sanger. Sanger coined its name, a portmanteau of wiki and encyclopedia. There was only the English language version initially, but it quickly developed similar versions in other languages, which differ in content and in editing practices. With 5,433,361 articles, the English Wikipedia is the largest of the more than 290 Wikipedia encyclopedias. Overall, Wikipedia consists of more than 40 million articles in more than 250 different languages and, as of February 2014, it had 18 billion page views and nearly 500 million unique visitors each month.\n", + "As of March 2017, Wikipedia has about forty thousand high-quality articles known as Featured Articles and Good Articles that cover vital topics. In 2005, Nature published a peer review comparing 42 science articles from Encyclopædia Britannica and Wikipedia, and found that Wikipedia's level of accuracy approached Encyclopædia Britannica's.\n", + "Wikipedia has been criticized for allegedly exhibiting systemic bias, presenting a mixture of \"truths, half truths, and some falsehoods\", and, in controversial topics, being subject to manipulation and spin.\n" + ] + } + ], + "source": [ + "import wikipedia\n", + "print(wikipedia.summary(\"Wikipedia\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Barack Obama',\n", + " 'Barack Obama in comics',\n", + " 'Barack Obama Sr.',\n", + " 'Barack Obama: Der schwarze Kennedy',\n", + " 'List of things named after Barack Obama',\n", + " 'Inauguration of Barack Obama',\n", + " 'Bibliography of Barack Obama',\n", + " 'Barack Obama Presidential Center',\n", + " 'Timeline of the presidency of Barack Obama',\n", + " 'Barack Obama religion conspiracy theories']" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"Barack\")" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "wikipedia.set_lang(\"en\")\n", + "test=wikipedia.page(\"SIBGHATULLAH_MOJADEDI\").html()\n", + "soup=BeautifulSoup(test,'lxml')\n", + "hi=soup.find(\"li\",{\"class\":\"interwiki-ar\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['http://www.worldcat.org/identities/containsVIAFID/75918762',\n", + " 'http://www.worldcat.org/oclc/123336516',\n", + " 'http://www.worldcat.org/oclc/237144347',\n", + " 'http://aviation-safety.net/database/record.php?id=19920529-0',\n", + " 'http://hrw.org/reports/2005/afghanistan0605/4.htm#_Toc105552342',\n", + " 'http://id.loc.gov/authorities/names/no97021045',\n", + " 'http://www.afghan-bios.info/index.php?option=com_afghanbios&id=1085&task=view&total=2314&start=1266&Itemid=2',\n", + " 'http://www.aftabir.com/news/article/view/2016/02/09/1139108',\n", + " 'http://www.bbc.com/pashto/afghanistan/2016/02/160215_hh-27th-anniv-soviet-forces-defeat-afg',\n", + " 'http://www.khaama.com/mojadedi-announces-the-establishment-of-a-new-political-council-9607',\n", + " 'http://www.mojaddedi.org/biography-of-sibghatullah-al-mojaddedi.html',\n", + " 'http://www.pts.af/',\n", + " 'http://www.rferl.org/content/article/1066619.html',\n", + " 'http://www.washingtontimes.com/news/2010/sep/28/afghan-peace-council-draws-fire/',\n", + " 'http://www.zmong-afghanistan.com/profiles/sibghatullah.asp',\n", + " 'https://archive.org/stream/azu_acku_risalah_ds371_2_meem46_yaa1375#page/n1/mode/1up',\n", + " 'https://books.google.com.my/books?id=1xyh_DBV1bMC&pg=PA492&lpg=PA492&dq=sibghatullah+mujaddidi+born&source=bl&ots=0-bbq_LRo5&sig=evfzzrgRMTkeWS13W4QhfaHJwe4&hl=en&sa=X&redir_esc=y#v=onepage&q=sibghatullah%20mujaddidi%20born&f=false',\n", + " 'https://books.google.com/books?id=RUSNyMH1aFQC&lpg=PR4&pg=PA406#v=onepage&q&f=false',\n", + " 'https://books.google.com/books?id=_zWhhy8L0uQC&lpg=PP1&pg=PT15#v=onepage&q&f=false',\n", + " 'https://viaf.org/viaf/75918762',\n", + " 'https://web.archive.org/web/20110606152711/http://www.zmong-afghanistan.com/profiles/sibghatullah.asp']" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.page(\"SIBGHATULLAH_MOJADEDI\").references" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Sibghatullah Mojaddedi (Pashto: صبغت الله مجددی\\u200e\\u200e, born 21 April 1925) is a politician in Afghanistan, who served as Acting President after the fall of Mohammad Najibullah's government in April 1992. He is also the founder of the Afghan National Liberation Front, and served as the chairman of the 2003 loya jirga that approved Afghanistan's new constitution.\"" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ny = wikipedia.page(\"New York\")\n", + "#ny.title\n", + "#ny.url\n", + "#ny.links[0]\n", + "#wikipedia.set_lang(\"en\")\n", + "wikipedia.summary(\"SIBGHATULLAH_MOJADEDI\", sentences=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Mohammad Najibullah',\n", + " 'Mohammad Najatuallah Siddiqui',\n", + " 'Abdul Razzaq (Taliban governor)',\n", + " 'Vice President of Afghanistan',\n", + " 'Abdul Wahed Sorabi',\n", + " 'Habibia High School',\n", + " 'Ghazi High School',\n", + " 'Najib',\n", + " 'Najibullah Torwayana',\n", + " 'National Reconciliation']" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"MOHAMMAD_NAJIBULLAH\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import wikipedia\n", + "print wikipedia.summary(\"Wikipedia\")\n", + "# Wikipedia (/ˌwɪkɨˈpiːdiə/ or /ˌwɪkiˈpiːdiə/ WIK-i-PEE-dee-ə) is a collaboratively edited, multilingual, free Internet encyclopedia supported by the non-profit Wikimedia Foundation...\n", + "\n", + "wikipedia.search(\"Barack\")\n", + "# [u'Barak (given name)', u'Barack Obama', u'Barack (brandy)', u'Presidency of Barack Obama', u'Family of Barack Obama', u'First inauguration of Barack Obama', u'Barack Obama presidential campaign, 2008', u'Barack Obama, Sr.', u'Barack Obama citizenship conspiracy theories', u'Presidential transition of Barack Obama']\n", + "\n", + ">>> ny = wikipedia.page(\"New York\")\n", + ">>> ny.title\n", + "# u'New York'\n", + ">>> ny.url\n", + "# u'http://en.wikipedia.org/wiki/New_York'\n", + ">>> ny.content\n", + "# u'New York is a state in the Northeastern region of the United States. New York is the 27th-most exten'...\n", + ">>> ny.links[0]\n", + "# u'1790 United States Census'\n", + "\n", + ">>> wikipedia.set_lang(\"fr\")\n", + ">>> wikipedia.summary(\"Facebook\", sentences=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/otherHelperCode/english_to_arabic_dictionary/geckodriver.log b/otherHelperCode/english_to_arabic_dictionary/geckodriver.log index c1c2616..311da19 100644 --- a/otherHelperCode/english_to_arabic_dictionary/geckodriver.log +++ b/otherHelperCode/english_to_arabic_dictionary/geckodriver.log @@ -56,3 +56,165 @@ A coding exception was thrown and uncaught in a Task. Full message: TypeError: NetworkError when attempting to fetch resource. Full stack: ************************* +1498869008642 geckodriver INFO Listening on 127.0.0.1:57951 +1498869009726 geckodriver::marionette INFO Starting browser /usr/lib/firefox/firefox.sh with args ["-marionette"] +1498869015095 Marionette INFO Listening on port 44687 +1498869015292 Marionette WARN TLS certificate errors will be ignored for this session +************************* +A coding exception was thrown and uncaught in a Task. + +Full message: TypeError: NetworkError when attempting to fetch resource. +Full stack: +************************* +1498871340318 Marionette INFO New connections will no longer be accepted +1498871366157 geckodriver INFO Listening on 127.0.0.1:38745 +1498871367250 geckodriver::marionette INFO Starting browser /usr/lib/firefox/firefox.sh with args ["-marionette"] +1498871373099 Marionette INFO Listening on port 45133 +1498871373303 Marionette WARN TLS certificate errors will be ignored for this session +1498880849606 geckodriver INFO Listening on 127.0.0.1:60889 +1498880850724 geckodriver::marionette INFO Starting browser /usr/lib/firefox/firefox.sh with args ["-marionette"] +1498880867110 Marionette INFO Listening on port 36831 +1498880867481 Marionette WARN TLS certificate errors will be ignored for this session +************************* +A coding exception was thrown and uncaught in a Task. + +Full message: TypeError: NetworkError when attempting to fetch resource. +Full stack: +************************* +************************* +A coding exception was thrown in a Promise resolution callback. +See https://developer.mozilla.org/Mozilla/JavaScript_code_modules/Promise.jsm/Promise + +Full message: TypeError: this._containers is null +Full stack: getContainer@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/markup/markup.js:477:5 +showNode/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/markup/markup.js:1139:30 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + +************************* +console.error: + Message: Error: Connection closed, pending request to server1.conn0.child1/stylesheet1469, type getOriginalLocation failed + +Request stack: +request@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1269:14 +generateRequestMethods/ resource://devtools/shared/protocol.js:1426:14 +getOriginalLocation@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:237:12 +getOriginalSourceStrings@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/models/rule.js:139:12 +updateSourceLink@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:276:7 +_create@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:124:5 +RuleEditor@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:77:3 +_createEditors@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:1070:23 +_populate/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:874:28 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + + Stack: + destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1212:23 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/stylesheets.js:109:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:40:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/inspector.js:963:5 +destroyInspector/this._destroyingInspector<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/framework/toolbox.js:2202:13 +_run@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/task.js:311:39 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + +console.error: + Message: Error: Connection closed, pending request to server1.conn0.child1/stylesheet1469, type getOriginalLocation failed + +Request stack: +request@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1269:14 +generateRequestMethods/ resource://devtools/shared/protocol.js:1426:14 +getOriginalLocation@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:237:12 +getOriginalSourceStrings@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/models/rule.js:139:12 +updateSourceLink@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:276:7 +_create@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:124:5 +RuleEditor@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:77:3 +_createEditors@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:1070:23 +_populate/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:874:28 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + + Stack: + destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1212:23 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/stylesheets.js:109:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:40:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/inspector.js:963:5 +destroyInspector/this._destroyingInspector<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/framework/toolbox.js:2202:13 +_run@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/task.js:311:39 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + +console.error: + Message: Error: Connection closed, pending request to server1.conn0.child1/stylesheet1469, type getOriginalLocation failed + +Request stack: +request@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1269:14 +generateRequestMethods/ resource://devtools/shared/protocol.js:1426:14 +getOriginalLocation@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:237:12 +getOriginalSourceStrings@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/models/rule.js:139:12 +updateSourceLink@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:276:7 +_create@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:124:5 +RuleEditor@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/views/rule-editor.js:77:3 +_createEditors@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:1070:23 +_populate/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/inspector/rules/rules.js:874:28 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + + Stack: + destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1212:23 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/stylesheets.js:109:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/styles.js:40:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:851:9 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/protocol.js:1214:5 +destroy@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/fronts/inspector.js:963:5 +destroyInspector/this._destroyingInspector<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/framework/toolbox.js:2202:13 +_run@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/task.js:311:39 +process@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:922:23 +walkerLoop@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:806:7 +scheduleWalkerLoop/<@resource://gre/modules/Promise.jsm -> resource://gre/modules/Promise-backend.js:742:11 + +console.error: + TypeError: this._panel is null: set hidden@chrome://devtools/content/debugger/debugger-view.js:883:7 +clearView@chrome://devtools/content/debugger/debugger-view.js:904:5 +clearViews@chrome://devtools/content/debugger/views/filter-view.js:206:5 +clearSearch@chrome://devtools/content/debugger/views/filter-view.js:195:5 +handleTabNavigation@chrome://devtools/content/debugger/debugger-view.js:791:5 +_onWillNavigate@chrome://devtools/content/debugger/debugger-controller.js:362:5 +emit@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/event-emitter.js:194:13 +_setupRemoteListeners/this._onTabNavigated@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/framework/target.js:513:9 +eventSource/proto.emit@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/client/main.js:130:9 +onPacket@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/client/main.js:1017:7 +send/<@chrome://marionette/content/server.js -> resource://devtools/shared/transport/transport.js:570:13 +exports.makeInfallible/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/ThreadSafeDevToolsUtils.js:101:14 +exports.makeInfallible/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/ThreadSafeDevToolsUtils.js:101:14 + +TypeError: this._panel is null: set hidden@chrome://devtools/content/debugger/debugger-view.js:883:7 +clearView@chrome://devtools/content/debugger/debugger-view.js:904:5 +clearViews@chrome://devtools/content/debugger/views/filter-view.js:206:5 +clearSearch@chrome://devtools/content/debugger/views/filter-view.js:195:5 +handleTabNavigation@chrome://devtools/content/debugger/debugger-view.js:791:5 +_onWillNavigate@chrome://devtools/content/debugger/debugger-controller.js:362:5 +emit@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/event-emitter.js:194:13 +_setupRemoteListeners/this._onTabNavigated@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/client/framework/target.js:513:9 +eventSource/proto.emit@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/client/main.js:130:9 +onPacket@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/client/main.js:1017:7 +send/<@chrome://marionette/content/server.js -> resource://devtools/shared/transport/transport.js:570:13 +exports.makeInfallible/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/ThreadSafeDevToolsUtils.js:101:14 +exports.makeInfallible/<@resource://gre/modules/commonjs/toolkit/loader.js -> resource://devtools/shared/ThreadSafeDevToolsUtils.js:101:14 + diff --git a/otherHelperCode/english_to_arabic_dictionary/hack_wiki.ipynb b/otherHelperCode/english_to_arabic_dictionary/hack_wiki.ipynb index f12101b..44faa5e 100644 --- a/otherHelperCode/english_to_arabic_dictionary/hack_wiki.ipynb +++ b/otherHelperCode/english_to_arabic_dictionary/hack_wiki.ipynb @@ -3,10 +3,23 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true - }, - "outputs": [], + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "module 'html5lib.treebuilders' has no attribute '_base'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#https://en.wikipedia.org/wiki\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mbs4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3/dist-packages/bs4/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mwarnings\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 30\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mbuilder\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mbuilder_registry\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mParserRejectedMarkup\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 31\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mdammit\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mUnicodeDammit\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m from .element import (\n", + "\u001b[0;32m/usr/lib/python3/dist-packages/bs4/builder/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 312\u001b[0m \u001b[0mregister_treebuilders_from\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_htmlparser\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 313\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 314\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0;34m.\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m_html5lib\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 315\u001b[0m \u001b[0mregister_treebuilders_from\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_html5lib\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3/dist-packages/bs4/builder/_html5lib.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mclass\u001b[0m \u001b[0mTreeBuilderForHtml5lib\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml5lib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtreebuilders\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTreeBuilder\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msoup\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnamespaceHTMLElements\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'html5lib.treebuilders' has no attribute '_base'" + ] + } + ], "source": [ "#https://en.wikipedia.org/wiki\n", "import requests\n", @@ -14,8 +27,7 @@ "import re\n", "import json\n", "import pickle\n", - "import datetime\n", - " " + "import datetime" ] }, { @@ -1081,6 +1093,7 @@ "cell_type": "code", "execution_count": 230, "metadata": { + "collapsed": true, "scrolled": true }, "outputs": [], diff --git a/otherHelperCode/english_to_arabic_dictionary/hack_wiki_asus.ipynb b/otherHelperCode/english_to_arabic_dictionary/hack_wiki_asus.ipynb new file mode 100644 index 0000000..7e35ed2 --- /dev/null +++ b/otherHelperCode/english_to_arabic_dictionary/hack_wiki_asus.ipynb @@ -0,0 +1,913 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import json\n", + "import pickle\n", + "import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import time\n", + "from selenium import webdriver\n", + "from selenium.webdriver.common.by import By\n", + "from selenium.webdriver.support.ui import WebDriverWait\n", + "from selenium.webdriver.support import expected_conditions as EC\n", + "from selenium.common.exceptions import TimeoutException\n", + "from selenium.webdriver.firefox.firefox_binary import FirefoxBinary\n", + "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "binary = FirefoxBinary(r'/usr/bin/firefox')\n", + "caps = DesiredCapabilities.FIREFOX.copy()\n", + "#Set ‘marionette’ browser to True\n", + "caps['marionette'] = True\n", + "#Launch the Firefox instance by specifying the geckodriver executable path\n", + "driver = webdriver.Firefox(firefox_binary=binary,capabilities=caps)\n", + "driver.wait = WebDriverWait(driver, 5)" + ] + }, + { + "cell_type": "code", + "execution_count": 242, + "metadata": {}, + "outputs": [], + "source": [ + "def formatOriginalNameToWikiName(originalname):\n", + " \"\"\"\n", + " return goodname if we return a better format from wiki\n", + " if not we just return empty string\n", + " we need this function,since if not formated yet, most of the time when you search wiki with the bad \n", + " name in the url it will return nothing.\n", + " \"\"\"\n", + " wikipedia.set_lang(\"en\")\n", + " allWikiResults=wikipedia.search(originalname)\n", + " if(len(allWikiResults)==0):\n", + " return \"\"\n", + " else:\n", + " return str(wikipedia.search(originalname)[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 294, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def MakeSeleniumToSearchWithOriginalName(originalname):\n", + " wikiname=formatOriginalNameToWikiName(originalname)\n", + " nametosearch=originalname if(wikiname==\"\") else wikiname\n", + " driver.get(\"https://en.wikipedia.org/wiki/\"+str(nametosearch))\n", + " result={}\n", + " result[\"findresult\"]={}\n", + " result[\"nofind\"]={}\n", + " try:\n", + " elem = driver.find_element_by_css_selector(\".interwiki-ar a\")\n", + " \"\"\"\n", + " these two lines of code needs to run before elem.click(), since it will goto\n", + " another page never find it any more.\n", + " \"\"\"\n", + " tempdic={}\n", + " #print(\"me\"+str(elem.get_attribute(\"href\")))\n", + " tempdic[\"arurl\"]=str(elem.get_attribute(\"href\"))\n", + " elem.click()\n", + " tempdic[\"originalname\"]=originalname\n", + " tempdic[\"wikiname\"]=wikiname\n", + " firstheading=driver.find_element_by_id(\"firstHeading\")\n", + " #arabic is from left to right that why u need to get the first one that returns.\n", + " tempdic[\"arname\"]=firstheading.text.split(\"\\n\")[0]\n", + " #print(tempdic[\"arname\"])\n", + " result[\"findresult\"]=tempdic\n", + " except Exception as e:\n", + " #print(e)\n", + " tempno={}\n", + " tempno[\"originalname\"]=originalname\n", + " result[\"nofind\"]=tempno\n", + " pass\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 253, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "hey=MakeSeleniumToSearchWithOriginalName(\"Mohammed Zahir Shah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "metadata": {}, + "outputs": [], + "source": [ + "nametosearch=\"Mohammed Zahir Shah\"\n", + "hey=driver.get(\"https://en.wikipedia.org/wiki/\"+str(nametosearch))" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "metadata": {}, + "outputs": [], + "source": [ + "elem=driver.find_element_by_css_selector(\".interwiki-ar a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "elem.click()" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "firstheading=driver.find_element_by_id(\"firstHeading\")" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'محمد ظاهر شاه'" + ] + }, + "execution_count": 262, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "firstheading.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def clean_line(line):\n", + " # Take out extra space, underscores, comments, etc.\n", + " cleaned = re.sub(\"_* .+\", \"\", line).strip()\n", + " cleaned = re.sub(\"_$\", \"\", cleaned, flags=re.MULTILINE)\n", + " return cleaned\n", + "\n", + "def ingest_dictionary(dict_path):\n", + " \"\"\"\n", + " Read in the country (or other) actor dictionaries.\n", + " \"\"\"\n", + " with open(dict_path) as f:\n", + " country_file = f.read()\n", + " split_file = country_file.split(\"\\n\")\n", + " \n", + " dict_dict = []\n", + " key_name = \"\"\n", + " alt_names = [] \n", + " roles = []\n", + "\n", + " for line in split_file:\n", + " if not line:\n", + " pass\n", + " elif line[0] == \"#\":\n", + " pass\n", + " elif re.match(\"[A-Z]\", line[0]):\n", + " # handle the previous\n", + " entry = {\"actor_en\" : key_name,\n", + " \"alt_names_en\" : alt_names,\n", + " \"roles\" : roles}\n", + " dict_dict.append(entry)\n", + " # zero everything out\n", + " alt_names = []\n", + " roles = []\n", + " # make new key name\n", + " key_name = clean_line(line)\n", + " # check to see if the role is built in\n", + " if bool(re.search(\"\\[[A-Z]{3}\\]\", line)):\n", + " roles = re.findall(\"\\[(.+?)\\]\", line)\n", + " elif line[0] == \"+\":\n", + " cleaned = clean_line(line[1:])\n", + " alt_names.append(cleaned)\n", + " elif re.match(\"\\s\", line):\n", + " roles.append(line.strip())\n", + " return dict_dict \n", + "dp = \"./Phoenix.Countries.actors.txt\"\n", + "dict_dict = ingest_dictionary(dp)" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18390" + ] + }, + "execution_count": 233, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dict_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'actor_en': 'AHMAD_CHALABI',\n", + " 'alt_names_en': [],\n", + " 'roles': ['[IRQELI 620101-030901]',\n", + " '[IRQGOV 030901-030930]',\n", + " '[IRQGOV 031101-040630]',\n", + " '[IRQGOV 050601-060531]',\n", + " '[IRQELI]']}" + ] + }, + "execution_count": 237, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_dict[7777]" + ] + }, + { + "cell_type": "code", + "execution_count": 292, + "metadata": {}, + "outputs": [], + "source": [ + "def buildMultiLanguageActorDictionary(dict_dict):\n", + " finalResult={}\n", + " finalResult[\"goodones\"]=[]\n", + " finalResult[\"badones\"]=[]\n", + " for item in dict_dict:\n", + " originalname=item[\"actor_en\"]\n", + " if(originalname!=\"\"):\n", + " temp=MakeSeleniumToSearchWithOriginalName(originalname)\n", + " if(temp[\"findresult\"]):\n", + " finalResult[\"goodones\"].append(temp[\"findresult\"])\n", + " else:\n", + " finalResult[\"badones\"].append(temp[\"nofind\"])\n", + " return finalResult\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 296, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'badones': [{'originalname': 'NUR_MOHAMMAD_TARAKI'},\n", + " {'originalname': 'HAJI_MOHAMMAD_CHAMKANI'},\n", + " {'originalname': 'ABDUL_RAHIM_HATEF'},\n", + " {'originalname': 'MULLAH_MOHAMMAD_RABBANI'},\n", + " {'originalname': 'MAWLAWI_ABDUL_KABIR'},\n", + " {'originalname': 'MOHAMMAD_NUR_AHMAD_ETEMADI'},\n", + " {'originalname': 'MOHAMMAD_MUSA_SHAFIQ'},\n", + " {'originalname': 'SULTAN_ALI_KESHTMAND'},\n", + " {'originalname': 'MOHAMMAD_HASSAN_SHARQ'},\n", + " {'originalname': 'FAZAL_HAQ_KHALIQYAR'},\n", + " {'originalname': 'ABDUL_SABUR_FARID_KUHESTANI'},\n", + " {'originalname': 'ARSALA_RAHMANI'},\n", + " {'originalname': 'AHMAD_SHAH_AHMADZAI'},\n", + " {'originalname': 'AHMED_ZIA_MASSOUD'},\n", + " {'originalname': 'ABDUL_RAHIM_GHAFOORZAI'}],\n", + " 'goodones': [{'arname': 'أفغانستان',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A3%D9%81%D8%BA%D8%A7%D9%86%D8%B3%D8%AA%D8%A7%D9%86',\n", + " 'originalname': 'AFGHANISTAN',\n", + " 'wikiname': 'Afghanistan'},\n", + " {'arname': 'قوات الأمن الوطنية الأفغانية',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%82%D9%88%D8%A7%D8%AA_%D8%A7%D9%84%D8%A3%D9%85%D9%86_%D8%A7%D9%84%D9%88%D8%B7%D9%86%D9%8A%D8%A9_%D8%A7%D9%84%D8%A3%D9%81%D8%BA%D8%A7%D9%86%D9%8A%D8%A9',\n", + " 'originalname': 'AFGHAN_NATIONAL_SECURITY_FORCES',\n", + " 'wikiname': 'Afghan National Security Forces'},\n", + " {'arname': 'محمد ظاهر شاه',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D8%B8%D8%A7%D9%87%D8%B1_%D8%B4%D8%A7%D9%87',\n", + " 'originalname': 'MOHAMMAD_ZAHIR_SHAH',\n", + " 'wikiname': 'Mohammed Zahir Shah'},\n", + " {'arname': 'عبد القادر',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B9%D8%A8%D8%AF_%D8%A7%D9%84%D9%82%D8%A7%D8%AF%D8%B1',\n", + " 'originalname': 'ABDUL_QADIR',\n", + " 'wikiname': 'Abdul Qadir'},\n", + " {'arname': 'حفيظ الله أمين',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%AD%D9%81%D9%8A%D8%B8_%D8%A7%D9%84%D9%84%D9%87_%D8%A3%D9%85%D9%8A%D9%86',\n", + " 'originalname': 'HAFIZULLAH_AMIN',\n", + " 'wikiname': 'Hafizullah Amin'},\n", + " {'arname': 'بابراك كرمال',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D8%A8%D8%B1%D8%A7%D9%83_%D9%83%D8%B1%D9%85%D8%A7%D9%84',\n", + " 'originalname': 'BABRAK_KARMAL',\n", + " 'wikiname': 'Babrak Karmal'},\n", + " {'arname': 'محمد نجيب الله',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D9%86%D8%AC%D9%8A%D8%A8_%D8%A7%D9%84%D9%84%D9%87',\n", + " 'originalname': 'MOHAMMAD_NAJIBULLAH',\n", + " 'wikiname': 'Mohammad Najibullah'},\n", + " {'arname': 'محمد داود خان',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D8%AF%D8%A7%D9%88%D8%AF_%D8%AE%D8%A7%D9%86',\n", + " 'originalname': 'SARDAR_MOHAMMAD_DAUD_KHAN',\n", + " 'wikiname': 'Mohammed Daoud Khan'},\n", + " {'arname': 'صبغت الله مجددي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B5%D8%A8%D8%BA%D8%AA_%D8%A7%D9%84%D9%84%D9%87_%D9%85%D8%AC%D8%AF%D8%AF%D9%8A',\n", + " 'originalname': 'SIBGHATULLAH_MOJADEDI',\n", + " 'wikiname': 'Sibghatullah Mojaddedi'},\n", + " {'arname': 'برهان الدين رباني',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%A8%D8%B1%D9%87%D8%A7%D9%86_%D8%A7%D9%84%D8%AF%D9%8A%D9%86_%D8%B1%D8%A8%D8%A7%D9%86%D9%8A',\n", + " 'originalname': 'BURHANUDDIN_RABBANI',\n", + " 'wikiname': 'Burhanuddin Rabbani'},\n", + " {'arname': 'حامد كرزاي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%AD%D8%A7%D9%85%D8%AF_%D9%83%D8%B1%D8%B2%D8%A7%D9%8A',\n", + " 'originalname': 'HAMID_KARZAI',\n", + " 'wikiname': 'Hamid Karzai'},\n", + " {'arname': 'عبد الظاهر',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B9%D8%A8%D8%AF_%D8%A7%D9%84%D8%B8%D8%A7%D9%87%D8%B1',\n", + " 'originalname': 'ABDUL_ZAHIR',\n", + " 'wikiname': 'Abdul Zahir'},\n", + " {'arname': 'غلبدين حكمتيار',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%BA%D9%84%D8%A8%D8%AF%D9%8A%D9%86_%D8%AD%D9%83%D9%85%D8%AA%D9%8A%D8%A7%D8%B1',\n", + " 'originalname': 'GULBUDDIN_HEKMATYAR',\n", + " 'wikiname': 'Gulbuddin Hekmatyar'},\n", + " {'arname': 'طارق معروفي',\n", + " 'arurl': 'https://ar.wikipedia.org/wiki/%D8%B7%D8%A7%D8%B1%D9%82_%D9%85%D8%B9%D8%B1%D9%88%D9%81%D9%8A',\n", + " 'originalname': 'AHMED_SHAH_MASSOUD',\n", + " 'wikiname': 'Tarek Maaroufi'}]}" + ] + }, + "execution_count": 296, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "buildMultiLanguageActorDictionary(dict_dict[0:30])" + ] + }, + { + "cell_type": "code", + "execution_count": 252, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Mohammed Zahir Shah',\n", + " 'Mohammadzai',\n", + " '1988 in Afghanistan',\n", + " 'Sardar Shah Wali Khan',\n", + " 'Bagrami District',\n", + " 'Qiamuddin Khadim',\n", + " 'Judiciary of Afghanistan',\n", + " 'Laili Helms',\n", + " 'United National Front (Afghanistan)',\n", + " 'Barakzai dynasty']" + ] + }, + "execution_count": 252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"MOHAMMAD_ZAHIR_SHAH\")" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": {}, + "outputs": [], + "source": [ + "test=len(wikipedia.search(\"obama\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Barack Obama'" + ] + }, + "execution_count": 212, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"obama\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [], + "source": [ + "MakeSeleniumToSearchWithWikiFormattedName(\"Mohammad Najibullah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.get(\"https://en.wikipedia.org/wiki/\"+\"Mohammad Najibullah\")" + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "metadata": {}, + "outputs": [], + "source": [ + "elem = driver.find_element_by_css_selector(\".interwiki-ar a\")" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://ar.wikipedia.org/wiki/%D9%85%D8%AD%D9%85%D8%AF_%D9%86%D8%AC%D9%8A%D8%A8_%D8%A7%D9%84%D9%84%D9%87'" + ] + }, + "execution_count": 228, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "elem.get_attribute(\"href\")" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": {}, + "outputs": [], + "source": [ + "elem.click()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 191, + "metadata": {}, + "outputs": [], + "source": [ + "test=driver.find_element_by_id(\"firstHeading\")\n", + "#arabic is from left to right that why u need to get the first one that returns.\n", + "test.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'محمد نجيب الله'" + ] + }, + "execution_count": 192, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#arabic is from left to right that why u need to get the first one that returns.\n", + "test.text.split(\"\\n\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.get('http://www.google.com')\n", + "\n", + "# search = driver.find_element_by_name('q')\n", + "# search.send_keys(\"selenium\")\n", + "# search.send_keys(Keys.RETURN) # hit return after you enter search text\n", + "from selenium.webdriver.common.keys import Keys\n", + "box = driver.wait.until(EC.presence_of_element_located(\n", + "(By.NAME, \"q\")))\n", + "button = driver.wait.until(EC.element_to_be_clickable(\n", + "(By.NAME, \"btnK\")))\n", + "box.send_keys(\"SIBGHATULLAH_MOJADEDI\")\n", + "box.send_keys(Keys.RETURN)\n", + "#time.sleep(5)\n", + "html=driver.page_source\n", + "soup=BeautifulSoup(html,\"html.parser\")\n", + "\n", + "#time.sleep(5) # sleep for 5 seconds so you can see the results\n", + "#driver.quit()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "hi=soup.find(\"div\",{\"class\":\"g\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "driver.quit()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "comments = soup.findAll('div',{'class':'g'}) " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlopen # Python 3\n", + "# from urllib2 import urlopen # Python 2\n", + "\n", + "url = \"https://en.wikipedia.org/wiki/\"+\"obama\"\n", + "soup = BeautifulSoup(urlopen(url), \"html.parser\")" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from googleapiclient.discovery import build\n", + "import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'https://en.wikipedia.org/wiki/Sibghatullah_Mojaddedi'\n" + ] + } + ], + "source": [ + "from googleapiclient.discovery import build\n", + "import pprint\n", + "\n", + "my_api_key = \"AIzaSyBBulleVoiDN9i8NITQqH_BUNGgyWX-nmA\"\n", + "my_cse_id = \"003461024781403571159:p4qrcenq1l0\"\n", + "\n", + "def google_search(search_term, api_key, cse_id, **kwargs):\n", + " service = build(\"customsearch\", \"v1\", developerKey=api_key)\n", + " res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()\n", + " #return res['spelling']['correctedQuery']\n", + "# return res['item']\n", + " return res['items']\n", + "results = google_search(\n", + " 'SIBGHATULLAH_MOJADEDI', my_api_key, my_cse_id, num=1)\n", + "for result in results:\n", + " pprint.pprint(result['formattedUrl'])\n", + "#print(results)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ألبرت أينشتاين\n" + ] + } + ], + "source": [ + "print(\"\\u0623\\u0644\\u0628\\u0631\\u062a \\u0623\\u064a\\u0646\\u0634\\u062a\\u0627\\u064a\\u0646\")" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "\n", + "payload = {'action': 'query', 'titles': 'Alert Einstein','prop':'langlinks','format':'json'}\n", + "\n", + "r = requests.get(\"https://en.wikipedia.org/w/api.php\", data=payload)\n", + "soup=BeautifulSoup(r.content,\"lxml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wikipedia ( or WIK-i-PEE-dee-ə) is a free online encyclopedia with the aim to allow anyone to edit articles. Wikipedia is the largest and most popular general reference work on the Internet and is ranked among the ten most popular websites. Wikipedia is owned by the nonprofit Wikimedia Foundation.\n", + "Wikipedia was launched on January 15, 2001, by Jimmy Wales and Larry Sanger. Sanger coined its name, a portmanteau of wiki and encyclopedia. There was only the English language version initially, but it quickly developed similar versions in other languages, which differ in content and in editing practices. With 5,433,361 articles, the English Wikipedia is the largest of the more than 290 Wikipedia encyclopedias. Overall, Wikipedia consists of more than 40 million articles in more than 250 different languages and, as of February 2014, it had 18 billion page views and nearly 500 million unique visitors each month.\n", + "As of March 2017, Wikipedia has about forty thousand high-quality articles known as Featured Articles and Good Articles that cover vital topics. In 2005, Nature published a peer review comparing 42 science articles from Encyclopædia Britannica and Wikipedia, and found that Wikipedia's level of accuracy approached Encyclopædia Britannica's.\n", + "Wikipedia has been criticized for allegedly exhibiting systemic bias, presenting a mixture of \"truths, half truths, and some falsehoods\", and, in controversial topics, being subject to manipulation and spin.\n" + ] + } + ], + "source": [ + "import wikipedia\n", + "print(wikipedia.summary(\"Wikipedia\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Barack Obama',\n", + " 'Barack Obama in comics',\n", + " 'Barack Obama Sr.',\n", + " 'Barack Obama: Der schwarze Kennedy',\n", + " 'List of things named after Barack Obama',\n", + " 'Inauguration of Barack Obama',\n", + " 'Bibliography of Barack Obama',\n", + " 'Barack Obama Presidential Center',\n", + " 'Timeline of the presidency of Barack Obama',\n", + " 'Barack Obama religion conspiracy theories']" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"Barack\")" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [], + "source": [ + "wikipedia.set_lang(\"en\")\n", + "test=wikipedia.page(\"SIBGHATULLAH_MOJADEDI\").html()\n", + "soup=BeautifulSoup(test,'lxml')\n", + "hi=soup.find(\"li\",{\"class\":\"interwiki-ar\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['http://www.worldcat.org/identities/containsVIAFID/75918762',\n", + " 'http://www.worldcat.org/oclc/123336516',\n", + " 'http://www.worldcat.org/oclc/237144347',\n", + " 'http://aviation-safety.net/database/record.php?id=19920529-0',\n", + " 'http://hrw.org/reports/2005/afghanistan0605/4.htm#_Toc105552342',\n", + " 'http://id.loc.gov/authorities/names/no97021045',\n", + " 'http://www.afghan-bios.info/index.php?option=com_afghanbios&id=1085&task=view&total=2314&start=1266&Itemid=2',\n", + " 'http://www.aftabir.com/news/article/view/2016/02/09/1139108',\n", + " 'http://www.bbc.com/pashto/afghanistan/2016/02/160215_hh-27th-anniv-soviet-forces-defeat-afg',\n", + " 'http://www.khaama.com/mojadedi-announces-the-establishment-of-a-new-political-council-9607',\n", + " 'http://www.mojaddedi.org/biography-of-sibghatullah-al-mojaddedi.html',\n", + " 'http://www.pts.af/',\n", + " 'http://www.rferl.org/content/article/1066619.html',\n", + " 'http://www.washingtontimes.com/news/2010/sep/28/afghan-peace-council-draws-fire/',\n", + " 'http://www.zmong-afghanistan.com/profiles/sibghatullah.asp',\n", + " 'https://archive.org/stream/azu_acku_risalah_ds371_2_meem46_yaa1375#page/n1/mode/1up',\n", + " 'https://books.google.com.my/books?id=1xyh_DBV1bMC&pg=PA492&lpg=PA492&dq=sibghatullah+mujaddidi+born&source=bl&ots=0-bbq_LRo5&sig=evfzzrgRMTkeWS13W4QhfaHJwe4&hl=en&sa=X&redir_esc=y#v=onepage&q=sibghatullah%20mujaddidi%20born&f=false',\n", + " 'https://books.google.com/books?id=RUSNyMH1aFQC&lpg=PR4&pg=PA406#v=onepage&q&f=false',\n", + " 'https://books.google.com/books?id=_zWhhy8L0uQC&lpg=PP1&pg=PT15#v=onepage&q&f=false',\n", + " 'https://viaf.org/viaf/75918762',\n", + " 'https://web.archive.org/web/20110606152711/http://www.zmong-afghanistan.com/profiles/sibghatullah.asp']" + ] + }, + "execution_count": 155, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.page(\"SIBGHATULLAH_MOJADEDI\").references" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Sibghatullah Mojaddedi (Pashto: صبغت الله مجددی\\u200e\\u200e, born 21 April 1925) is a politician in Afghanistan, who served as Acting President after the fall of Mohammad Najibullah's government in April 1992. He is also the founder of the Afghan National Liberation Front, and served as the chairman of the 2003 loya jirga that approved Afghanistan's new constitution.\"" + ] + }, + "execution_count": 156, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ny = wikipedia.page(\"New York\")\n", + "#ny.title\n", + "#ny.url\n", + "#ny.links[0]\n", + "#wikipedia.set_lang(\"en\")\n", + "wikipedia.summary(\"SIBGHATULLAH_MOJADEDI\", sentences=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Mohammad Najibullah',\n", + " 'Mohammad Najatuallah Siddiqui',\n", + " 'Abdul Razzaq (Taliban governor)',\n", + " 'Vice President of Afghanistan',\n", + " 'Abdul Wahed Sorabi',\n", + " 'Habibia High School',\n", + " 'Ghazi High School',\n", + " 'Najib',\n", + " 'Najibullah Torwayana',\n", + " 'National Reconciliation']" + ] + }, + "execution_count": 187, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wikipedia.search(\"MOHAMMAD_NAJIBULLAH\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import wikipedia\n", + "print wikipedia.summary(\"Wikipedia\")\n", + "# Wikipedia (/ˌwɪkɨˈpiːdiə/ or /ˌwɪkiˈpiːdiə/ WIK-i-PEE-dee-ə) is a collaboratively edited, multilingual, free Internet encyclopedia supported by the non-profit Wikimedia Foundation...\n", + "\n", + "wikipedia.search(\"Barack\")\n", + "# [u'Barak (given name)', u'Barack Obama', u'Barack (brandy)', u'Presidency of Barack Obama', u'Family of Barack Obama', u'First inauguration of Barack Obama', u'Barack Obama presidential campaign, 2008', u'Barack Obama, Sr.', u'Barack Obama citizenship conspiracy theories', u'Presidential transition of Barack Obama']\n", + "\n", + ">>> ny = wikipedia.page(\"New York\")\n", + ">>> ny.title\n", + "# u'New York'\n", + ">>> ny.url\n", + "# u'http://en.wikipedia.org/wiki/New_York'\n", + ">>> ny.content\n", + "# u'New York is a state in the Northeastern region of the United States. New York is the 27th-most exten'...\n", + ">>> ny.links[0]\n", + "# u'1790 United States Census'\n", + "\n", + ">>> wikipedia.set_lang(\"fr\")\n", + ">>> wikipedia.summary(\"Facebook\", sentences=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "env", + "language": "python", + "name": "env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}