Skip to content

Commit

Permalink
#180 first commit using bs4 to grab the role and period range for an …
Browse files Browse the repository at this point in the history
…actor
  • Loading branch information
YanLiang1102 committed Jun 29, 2017
1 parent 60a8378 commit 78cbe0d
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@
" except Exception as e:\n",
" noFindList.append(e)\n",
" #print(e)\n",
" return noFindList\n",
" "
]
},
Expand All @@ -279,7 +280,7 @@
"#then dump the data to pickle\n",
"try:\n",
" with open(\"noFindWord\", 'wb') as f:\n",
" pickle.dump(nofind, f, pickle.HIGHEST_PROTOCOL)\n",
" pickle.dump(notfind, f, pickle.HIGHEST_PROTOCOL)\n",
"except:\n",
" print(\"failed to save the result to disk\")\n",
" pass\n"
Expand Down
Empty file.
185 changes: 184 additions & 1 deletion otherHelperCode/english_to_arabic_dictionary/hack_wiki.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#this will need the user to pass in an english name and come back with an arabic name\n",
Expand Down Expand Up @@ -136,6 +138,187 @@
"hack_wiki(\"Mohammad_Najibullah\")"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# Download the article and collect the links that sit inside infobox header cells.\n",
"base_url = \"https://en.wikipedia.org/wiki/\" + \"Mohammad_Najibullah\"\n",
"page = requests.get(url=base_url)\n",
"soup = BeautifulSoup(markup=page.content, features=\"lxml\")\n",
"lists = soup.select(\".infobox tr th a\")"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['President of Afghanistan']\n",
"[\"General Secretary of the Central Committee of the People's Democratic Party of Afghanistan\"]\n",
"['Director of the State Intelligence Agency']\n",
"['Alma mater']\n"
]
}
],
"source": [
"# Show the child nodes of every element currently held in `lists`.\n",
"for item in lists:\n",
"    print(item.contents)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"30 September 1987– 16 April 1992\n",
"4 May 1986– 16 April 1992\n",
"11 January 1980– 21 November 1985\n"
]
}
],
"source": [
"# Print the text that follows a nowrap label inside an infobox cell\n",
"# (in the recorded run these are the \"In office\" periods of each role).\n",
"base_url=\"https://en.wikipedia.org/wiki/\"+\"Mohammad_Najibullah\"\n",
"page=requests.get(base_url)\n",
"soup=BeautifulSoup(page.content,\"lxml\")\n",
"infobox=soup.find(\"table\",{\"class\":\"infobox\"})\n",
"# A page without an infobox gives an empty row list instead of an AttributeError on None.\n",
"lists=infobox.find_all(\"tr\") if infobox is not None else []\n",
"for item in lists:\n",
"    cell=item.find(\"td\")\n",
"    if cell is None:\n",
"        continue\n",
"    label=cell.find(\"span\",{\"class\":\"nowrap\"})\n",
"    if label is None or label.parent is None:\n",
"        continue\n",
"    tempstr=label.parent.contents\n",
"    # str() of the node list shows text nodes via repr, so newlines and non-breaking\n",
"    # spaces appear as literal escape sequences; the split/replace below key on those.\n",
"    strstr=str(tempstr)\n",
"    if(\"display:none\" in strstr):\n",
"        # cells carrying hidden markup (e.g. the machine-readable date in the \"Died\" row) are skipped\n",
"        continue\n",
"    pieces=strstr.split('\\\\n')\n",
"    if len(pieces) < 2:\n",
"        # no newline escape in the repr: nothing follows the label, so avoid an IndexError\n",
"        continue\n",
"    # [:-2] drops the closing quote and bracket when this segment ends the list repr\n",
"    print(pieces[1][:-2].replace(\"\\\\xa0\",''))\n",
"\n",
"#$('.infobox tr th a').closest(\"tr\").next(\"tr\").find(\".nowrap\").closest(\"td\").each(function(){console.log($(this).text())})"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<tr>\n",
" <th colspan=\"2\" style=\"text-align:center;font-size:125%;font-weight:bold;font-size: 130%;\"><span class=\"fn\">Dr Najibullah Ahmadzai</span></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center\"><a class=\"image\" href=\"/wiki/File:Najib.jpg\"><img alt=\"Najib.jpg\" data-file-height=\"351\" data-file-width=\"283\" height=\"273\" src=\"//upload.wikimedia.org/wikipedia/en/thumb/4/4c/Najib.jpg/220px-Najib.jpg\" srcset=\"//upload.wikimedia.org/wikipedia/en/4/4c/Najib.jpg 1.5x\" width=\"220\"/></a></td>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center\"></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/President_of_Afghanistan\" title=\"President of Afghanistan\">President of Afghanistan</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 30 September 1987 – 16 April 1992</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n",
" <td><a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n",
" <a href=\"/wiki/Mohammad_Hasan_Sharq\" title=\"Mohammad Hasan Sharq\">Mohammad Hasan Sharq</a><br/>\n",
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a><br/>\n",
" <a href=\"/wiki/Fazal_Haq_Khaliqyar\" title=\"Fazal Haq Khaliqyar\">Fazal Haq Khaliqyar</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Haji_Mohammad_Chamkani\" title=\"Haji Mohammad Chamkani\">Haji Mohammad Chamkani</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td><a href=\"/wiki/Abdul_Rahim_Hatif\" title=\"Abdul Rahim Hatif\">Abdul Rahim Hatif</a> <small>(acting)</small></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">General Secretary of the Central Committee of the People's Democratic Party of Afghanistan</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 4 May 1986 – 16 April 1992</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td>Position abolished</td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\"><a href=\"/wiki/KHAD\" title=\"KHAD\">Director of the State Intelligence Agency</a></th>\n",
" </tr>, <tr>\n",
" <td colspan=\"2\" style=\"text-align:center;border-bottom:none\"><span class=\"nowrap\"><b>In office</b></span><br/>\n",
" 11 January 1980 – 21 November 1985</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\">President</th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Prime Minister</span></th>\n",
" <td><a href=\"/wiki/Babrak_Karmal\" title=\"Babrak Karmal\">Babrak Karmal</a><br/>\n",
" <a href=\"/wiki/Sultan_Ali_Keshtmand\" title=\"Sultan Ali Keshtmand\">Sultan Ali Keshtmand</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Preceded by</span></th>\n",
" <td><a href=\"/wiki/Assadullah_Sarwari\" title=\"Assadullah Sarwari\">Assadullah Sarwari</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\" style=\"text-align:left\"><span class=\"nowrap\">Succeeded by</span></th>\n",
" <td><a href=\"/wiki/Ghulam_Faruq_Yaqubi\" title=\"Ghulam Faruq Yaqubi\">Ghulam Faruq Yaqubi</a></td>\n",
" </tr>, <tr style=\"display:none\">\n",
" <td colspan=\"2\"></td>\n",
" </tr>, <tr>\n",
" <th colspan=\"2\" style=\"text-align:center;background:lavender\">Personal details</th>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Born</th>\n",
" <td>February 1947<br/>\n",
" <a class=\"mw-redirect\" href=\"/wiki/Paktia\" title=\"Paktia\">Paktia</a>, <a href=\"/wiki/Kingdom_of_Afghanistan\" title=\"Kingdom of Afghanistan\">Afghanistan</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Died</th>\n",
" <td><span class=\"nowrap\">28 September 1996<span style=\"display:none\">(<span class=\"dday deathdate\">1996-09-28</span>)</span> (aged 49)</span><br/>\n",
" <a href=\"/wiki/Kabul\" title=\"Kabul\">Kabul</a>, <a href=\"/wiki/Islamic_Emirate_of_Afghanistan\" title=\"Islamic Emirate of Afghanistan\">Afghanistan</a></td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Political party</th>\n",
" <td><a href=\"/wiki/People%27s_Democratic_Party_of_Afghanistan\" title=\"People's Democratic Party of Afghanistan\">People's Democratic Party of Afghanistan</a><br/>\n",
" (<a href=\"/wiki/Parcham\" title=\"Parcham\">Parcham</a>)</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Spouse(s)</th>\n",
" <td>Dr. Fatana Najib</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\">Children</th>\n",
" <td>three daughters</td>\n",
" </tr>, <tr>\n",
" <th scope=\"row\"><a href=\"/wiki/Alma_mater\" title=\"Alma mater\">Alma mater</a></th>\n",
" <td><a href=\"/wiki/Kabul_University\" title=\"Kabul University\">Kabul University</a></td>\n",
" </tr>]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lists"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n"
]
}
],
"source": [
"print(lists)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down
Empty file.

0 comments on commit 78cbe0d

Please sign in to comment.