Commit

Initial commit
Sadi Çelik committed Nov 6, 2021
0 parents commit 8e6220a
Showing 133 changed files with 32,932 additions and 0 deletions.
14 changes: 14 additions & 0 deletions DrugBank-playwright-html-download.py
@@ -0,0 +1,14 @@
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.webkit.launch()
    page = browser.new_page()

    for page_number in range(1, 125):
        page.goto("https://go.drugbank.com/pharmaco/metabolomics?page={}".format(page_number))
        page.wait_for_timeout(10000)  # fixed pause so the Cloudflare check and page render can finish
        # Change the file location on your device (forward slash also works on Windows)
        with open("pharmaco-metabolomics-data/pharmaco-metabolomics-page{}_markup.html".format(page_number), "w", encoding="utf-8") as file:
            file.write(page.content())

    browser.close()
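
A possible refinement (my sketch, not part of this commit): rather than a fixed 10-second pause, wait for the results table itself. The CSS class is the one Drugbank-Web-Parser.py targets below, so treat the selector as an assumption about the page markup.

    # Sketch: block until the metabolite table is present (selector assumed from the parser script).
    page.wait_for_selector("table.table-metabolite-regulations", timeout=30000)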
45 changes: 45 additions & 0 deletions Drugbank-Web-Parser.py
@@ -0,0 +1,45 @@
import pandas as pd
from bs4 import BeautifulSoup

# Helper for optional fields (currently unused below)
def extract_text(soup_obj, tag, attribute_name, attribute_value):
    txt = soup_obj.find(tag, {attribute_name: attribute_value}).text.strip() if soup_obj.find(tag, {attribute_name: attribute_value}) else ''
    return txt

rows = []

for page_number in range(1, 125):
    # Change the file location on your device
    with open("pharmaco-metabolomics-data/pharmaco-metabolomics-page{}_markup.html".format(page_number), "r", encoding="utf-8") as file:
        data = file.read()
    soup = BeautifulSoup(data, 'html.parser')
    drug_metabolite_table = soup.find('table', {'class': 'table-metabolite-regulations table table-bordered'})
    drug_metabolite_table_body = drug_metabolite_table.find("tbody")  # search within the table, not the whole page
    drug_metabolite_table_body_row = drug_metabolite_table_body.find_all("tr")

    for row in drug_metabolite_table_body_row:
        temp_data = row.find_all("td")
        drug = temp_data[0].text
        metabolite = temp_data[2].text
        change = temp_data[3].text.strip()
        if change == "increased":
            increased = 1
            decreased = 0
        elif change == "decreased":
            decreased = 1
            increased = 0
        else:
            decreased = 0
            increased = 0
        description = temp_data[4].text

        rows.append([drug, metabolite, change, increased, decreased, description])


# Create dataframe and export

columns = ["Drug", "Metabolite", "Change", "Increased", "Decreased", "Description"]

df = pd.DataFrame(data=rows, columns=columns)
df.sort_values(by="Drug", ascending=True, inplace=True)
df.to_csv('pharmaco-metabolomics-data.csv', index=False)
df.to_excel('pharmaco-metabolomics-data.xlsx', index=False)
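
A quick sanity check of the export (my sketch, not in the commit); note that the .xlsx step above needs an Excel engine such as openpyxl installed.

    # Sketch: confirm the CSV round-trips and eyeball the first rows.
    print(pd.read_csv('pharmaco-metabolomics-data.csv').head())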
7 changes: 7 additions & 0 deletions demos/DrugBank-Web-Scraper-Bypass-2.py
@@ -0,0 +1,7 @@
import cfscrape

url = "https://go.drugbank.com/pharmaco/metabolomics"
scraper = cfscrape.create_scraper(delay=15) # returns a CloudflareScraper instance

# Or: scraper = cfscrape.CloudflareScraper() # CloudflareScraper inherits from requests.Session
print(scraper.get(url).content)  # => b"<!DOCTYPE html><html><head>..." (.content is bytes)
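
To keep the fetched markup for the parser above (my sketch; the filename is illustrative):

    # Sketch: persist the bypassed page; .content is bytes, so open in binary mode.
    with open("page_markup_bypass_2.html", "wb") as file:
        file.write(scraper.get(url).content)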
8 changes: 8 additions & 0 deletions demos/DrugBank-Web-Scraper-Bypass-3.py
@@ -0,0 +1,8 @@
import undetected_chromedriver as uc

driver = uc.Chrome()
with driver:  # the context manager quits the driver on exit, so save the page inside the block
    driver.get('https://go.drugbank.com/pharmaco/metabolomics')  # known URL using Cloudflare's "under attack mode"

    with open("page_markup_bypass_3.html", "w", encoding="utf-8") as file:
        file.write(driver.page_source)
9 changes: 9 additions & 0 deletions demos/DrugBank-Web-Scraper-Bypass.py
@@ -0,0 +1,9 @@
import cloudscraper

url = "https://go.drugbank.com/pharmaco/metabolomics"

scraper = cloudscraper.create_scraper(browser='chrome', debug=True)
# Or: scraper = cloudscraper.CloudScraper()  # CloudScraper inherits from requests.Session

print(scraper.get(url).text)
42 changes: 42 additions & 0 deletions demos/DrugBank-Web-Scraper.py
@@ -0,0 +1,42 @@
# DrugBank-Web Scraper

import time
import pandas as pd  # pandas/BeautifulSoup imported for later parsing; unused in this step
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Initialize the driver
chrome_driver_path = r"D:\Development\Python\Web Scarping\WebDriver\chromedriver.exe"  # Change to your local driver location

# Options for Cloudflare bypass
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--disable-blink-features=AutomationControlled")

driver = webdriver.Chrome(options=options, executable_path=chrome_driver_path)

# Delay time (seconds) for the Cloudflare challenge to clear
delay = 120

driver.get("https://go.drugbank.com/pharmaco/metabolomics")
time.sleep(delay)

with open("page_markup.html", "w", encoding="utf-8") as file:
    file.write(driver.page_source)

# Explicit-wait alternative to the fixed sleep above:
# try:
#     WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.CLASS_NAME, 'card-body')))
# except TimeoutException:
#     print('Loading exceeds delay time')
#     # break
# else:
#     with open("page_markup.html", "w", encoding="utf-8") as file:
#         file.write(driver.page_source)
# finally:
#     driver.quit()
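
The saved page can then be parsed offline the same way Drugbank-Web-Parser.py does (my sketch, reusing that script's table selector):

    # Sketch: verify the saved markup actually contains the metabolite table.
    with open("page_markup.html", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
    print(soup.find('table', {'class': 'table-metabolite-regulations table table-bordered'}) is not None)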
11 changes: 11 additions & 0 deletions demos/demo.py
@@ -0,0 +1,11 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd

page = requests.get("https://go.drugbank.com/pharmaco/metabolomics")
soup = BeautifulSoup(page.content, 'html.parser')

print(page)  # prints the Response object, including its status code

# with open("drug_demo.html", "w", encoding="utf-8") as file:
#     file.write(str(soup))  # str() needed: a BeautifulSoup object can't be written directly
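
Why the bypass demos exist (my note, not part of the commit): a plain requests call to a Cloudflare-protected page typically fails the browser check, which a status check makes visible.

    # Sketch: a 403 here would indicate Cloudflare blocked the plain request.
    print(page.status_code, page.reason)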