-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit fd3eced
Showing
78 changed files
with
366,695 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
.vscode/ | ||
__pycache__/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "WOsmoXE7p9nm", | ||
"metadata": { | ||
"id": "WOsmoXE7p9nm" | ||
}, | ||
"source": [ | ||
"# Web Scraper file\n", | ||
"\n", | ||
"The data would be extracted from The Real Estate website: PropertyPro.NG\n", | ||
"\n", | ||
"I have researched the website and see some features that most houses/ listings have:\n", | ||
"\n", | ||
"We would first get the data for these locations - Ikeja, Ikoyi, Lekki, Victoria Island \n", | ||
"\n", | ||
"After that for each city we would get the following information\n", | ||
"* Title of the home/listing - which we can get info like\n", | ||
" * If the house is a duplex, apartment, land, flat\n", | ||
"___\n", | ||
"* Location of home : we can get info like\n", | ||
" * city home is located, neighborhood/area it is located\n", | ||
"___\n", | ||
"* Price of Home : It is a string and would be converted into integers\n", | ||
"___\n", | ||
"* If home is Newly Built (True/ False)\n", | ||
"* If home is Serviced (True/ False)\n", | ||
"* If home is Furnished (True/ False)\n", | ||
"___\n", | ||
"* No of Bedrooms (String convert to integer)\n", | ||
"___\n", | ||
"* No of Bathrooms (String convert to integer)\n", | ||
"___\n", | ||
"* No of Toilets (String convert to integer)\n", | ||
"___\n", | ||
"* No of Bathrooms (String convert to integer)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 40, | ||
"id": "lkuqHn6kp9nv", | ||
"metadata": { | ||
"id": "lkuqHn6kp9nv" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# !pip install bs4 requests --quiet" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "0Qy4A-1Yp9nx", | ||
"metadata": { | ||
"id": "0Qy4A-1Yp9nx" | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"C:\\Users\\EMEH\\Anaconda3\\lib\\site-packages\\requests\\__init__.py:91: RequestsDependencyWarning: urllib3 (1.26.8) or chardet (3.0.4) doesn't match a supported version!\n", | ||
" RequestsDependencyWarning)\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Import Seperate Libraries\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"import requests\n", | ||
"import csv" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "zZkBJYC7p9ny", | ||
"metadata": { | ||
"id": "zZkBJYC7p9ny" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"from requests.adapters import HTTPAdapter\n", | ||
"from requests.packages.urllib3.util.retry import Retry" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 43, | ||
"id": "QpPNtiFMp9nz", | ||
"metadata": { | ||
"id": "QpPNtiFMp9nz" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# This would make us scrape in sessions so that if we have network issues\n", | ||
"# we wont need to start scraping again we would just revert to the previous\n", | ||
"# session tracked\n", | ||
"session = requests.Session()\n", | ||
"retry = Retry(connect=3, backoff_factor=0.5)\n", | ||
"adapter = HTTPAdapter(max_retries=retry)\n", | ||
"session.mount('http://', adapter)\n", | ||
"session.mount('https://', adapter)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 44, | ||
"id": "wPSVrDGvp9nz", | ||
"metadata": { | ||
"id": "wPSVrDGvp9nz" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Save in this file\n", | ||
"csv_file = open(\"ikoyi.csv\", \"w\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 45, | ||
"id": "1na0vgB0p9n0", | ||
"metadata": { | ||
"id": "1na0vgB0p9n0" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"csv_writer = csv.writer(csv_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"id": "6l9nahUbp9n1", | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/" | ||
}, | ||
"id": "6l9nahUbp9n1", | ||
"outputId": "f174246d-9639-4be6-e70e-a4622271f444" | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"89" | ||
] | ||
}, | ||
"execution_count": 46, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"csv_writer.writerow([\"Title\", \"Location\", \"Price\",\"Serviced\", \"Newly Built\", \"Furnished\", \"Bedrooms\", \"Bathrooms\", \"Toilets\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "Vig6nktLp9n3", | ||
"metadata": { | ||
"id": "Vig6nktLp9n3" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Get Location data\n", | ||
"for page in range(0, 484):\n", | ||
" # Replace the ikoyi with the city you want e.g. surulere, ajah, ikeja, lekki\n", | ||
" link = session.get(f\"https://www.propertypro.ng/property-for-sale/in/lagos/ikoyi?page={page}\").text \n", | ||
" soup = BeautifulSoup(link, \"lxml\")\n", | ||
" \n", | ||
" for listing in soup.find_all(\"div\", class_=\"single-room-sale listings-property\"):\n", | ||
" title = listing.find(\"h3\", class_=\"listings-property-title2\").text\n", | ||
" location = listing.find_all(\"h4\")[1].text\n", | ||
"\n", | ||
" price_info = listing.find(\"h3\", class_=\"listings-price\")\n", | ||
" price = price_info.find_all(\"span\")[1].text\n", | ||
"\n", | ||
" if listing.find(\"a\", href=\"/property-for-sale/is-new\"):\n", | ||
" is_new = 1\n", | ||
" else:\n", | ||
" is_new = 0\n", | ||
" \n", | ||
" if listing.find(\"a\", href=\"/property-for-sale/is-serviced\"):\n", | ||
" serviced = 1\n", | ||
" else:\n", | ||
" serviced = 0\n", | ||
" \n", | ||
" if listing.find(\"a\", href=\"/property-for-sale/is-furnished\"):\n", | ||
" furnished = 1\n", | ||
" else:\n", | ||
" furnished = 0\n", | ||
" \n", | ||
" more_info = listing.find(\"div\", class_=\"fur-areea\")\n", | ||
" bed = more_info.find_all(\"span\")[0].text\n", | ||
" bathroom = more_info.find_all(\"span\")[1].text\n", | ||
" toilet = more_info.find_all(\"span\")[2].text\n", | ||
"\n", | ||
" csv_writer.writerow([title, location, price, serviced, is_new, furnished, bed, bathroom, toilet])\n", | ||
"print(\"Done\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "HdZmwO_dp9n5", | ||
"metadata": { | ||
"id": "HdZmwO_dp9n5" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"csv_file.close()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"collapsed_sections": [], | ||
"name": "1 House Web Scraper.ipynb", | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3.9.12 ('base')", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
}, | ||
"toc": { | ||
"base_numbering": 1, | ||
"nav_menu": {}, | ||
"number_sections": true, | ||
"sideBar": true, | ||
"skip_h1_title": false, | ||
"title_cell": "Table of Contents", | ||
"title_sidebar": "Contents", | ||
"toc_cell": false, | ||
"toc_position": {}, | ||
"toc_section_display": true, | ||
"toc_window_display": false | ||
}, | ||
"vscode": { | ||
"interpreter": { | ||
"hash": "397fa5c989aefacd265b089cd9fa90a007575229207dba4eead3a1aa550249df" | ||
} | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.