-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathupdate_aircraft_db.py
218 lines (192 loc) · 8.57 KB
/
update_aircraft_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import os
import requests
import pandas as pd
import sqlite3
from bs4 import BeautifulSoup
from contextlib import closing
import csv
import logging
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Constants
BASE_URL = "https://opensky-network.org/datasets/metadata/"
DB_FILENAME = "aircraftData.db"
TABLE_NAME = "aircraft_type"
LOG_FILENAME = "download_log.txt"
BOEING_747_CSV = "boeing_747_icao.csv"
def get_latest_csv_version(base_url):
logger.info("Fetching the latest dataset version...")
try:
response = requests.get(base_url)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
files = [a['href'] for a in soup.find_all('a', href=True) if 'aircraft-database-complete' in a['href'] and a['href'].endswith('.csv')]
if files:
latest_file = sorted(files)[-1]
logger.info(f"Found latest version: {latest_file}")
return latest_file
else:
logger.warning("No CSV files found on the page.")
return None
else:
logger.error("Failed to retrieve data from OpenSky Network.")
return None
except Exception as e:
logger.error(f"An error occurred: {e}")
return None
def download_csv(url, filename):
logger.info(f"Downloading {url}...")
try:
response = requests.get(url)
response.raise_for_status()
with open(filename, 'wb') as f:
f.write(response.content)
logger.info("Download completed.")
return True
except Exception as e:
logger.error(f"Failed to download the file: {e}")
return False
def csv_to_sqlite(csv_file, db_file):
logger.info(f"Converting the entire CSV {csv_file} to SQLite with table {TABLE_NAME}...")
try:
df = pd.read_csv(csv_file, low_memory=False, on_bad_lines='skip', quotechar="'", escapechar='\\')
# Remove quotes from column names
df.columns = df.columns.str.replace("'", "")
with closing(sqlite3.connect(db_file)) as conn:
df.to_sql(TABLE_NAME, conn, if_exists='replace', index=False)
logger.info(f"SQLite database updated with {TABLE_NAME} data.")
return True
except Exception as e:
logger.error(f"An error occurred while converting CSV to SQLite: {e}")
return False
def get_column_names(db_file):
logger.info("Getting column names from the database...")
try:
with closing(sqlite3.connect(db_file)) as conn:
cursor = conn.cursor()
cursor.execute(f"PRAGMA table_info({TABLE_NAME})")
columns = [row[1] for row in cursor.fetchall()]
logger.info(f"Column names: {columns}")
return columns
except Exception as e:
logger.error(f"An error occurred while getting column names: {e}")
return []
def print_sample(db_file, columns):
logger.info("Printing a sample of the database contents...")
try:
with closing(sqlite3.connect(db_file)) as conn:
sample_query = f"""
SELECT {', '.join(columns[:6])}
FROM {TABLE_NAME}
LIMIT 20
"""
df = pd.read_sql_query(sample_query, conn)
logger.info("Sample of database contents:")
logger.info(df.to_string())
except Exception as e:
logger.error(f"An error occurred while printing sample data: {e}")
def filter_boeing_747(db_file, columns):
logger.info("Filtering for Boeing 747 aircraft...")
try:
with closing(sqlite3.connect(db_file)) as conn:
# Check total number of records
total_count = pd.read_sql_query(f"SELECT COUNT(*) FROM {TABLE_NAME}", conn).iloc[0, 0]
logger.info(f"Total records in database: {total_count}")
# Determine which columns to use for filtering
manufacturer_col = next((col for col in columns if 'manufacturer' in col.lower()), None)
model_col = next((col for col in columns if 'model' in col.lower()), None)
type_col = next((col for col in columns if 'type' in col.lower()), None)
if not all([manufacturer_col, model_col, type_col]):
logger.error("Could not find necessary columns for filtering")
return False
# Check for any Boeing aircraft
boeing_query = f"""
SELECT COUNT(*) FROM {TABLE_NAME}
WHERE LOWER({manufacturer_col}) LIKE '%boeing%'
OR LOWER({model_col}) LIKE '%boeing%'
OR LOWER({type_col}) LIKE 'b7%'
"""
boeing_count = pd.read_sql_query(boeing_query, conn).iloc[0, 0]
logger.info(f"Total Boeing aircraft: {boeing_count}")
# Check for 747 in any field
query_747 = f"""
SELECT COUNT(*) FROM {TABLE_NAME}
WHERE LOWER({model_col}) LIKE '%747%'
OR LOWER({type_col}) LIKE '%747%'
OR LOWER({type_col}) LIKE 'b74%'
"""
count_747 = pd.read_sql_query(query_747, conn).iloc[0, 0]
logger.info(f"Total aircraft with '747' in any field: {count_747}")
# Updated query to be more inclusive
icao24_col = next((col for col in columns if 'icao24' in col.lower()), None)
registration_col = next((col for col in columns if 'registration' in col.lower()), None)
query = f"""
SELECT {icao24_col}, {registration_col}, {manufacturer_col}, {model_col}, {type_col}
FROM {TABLE_NAME}
WHERE LOWER({model_col}) LIKE '%747%'
OR LOWER({type_col}) LIKE '%747%'
OR LOWER({type_col}) LIKE 'b74%'
OR (LOWER({manufacturer_col}) LIKE '%boeing%' AND LOWER({model_col}) LIKE '%747%')
"""
df = pd.read_sql_query(query, conn)
if df.empty:
logger.warning("No Boeing 747 aircraft found in the database.")
return False
df.to_csv(BOEING_747_CSV, index=False)
logger.info(f"Boeing 747 aircraft data saved to {BOEING_747_CSV}")
logger.info(f"Number of Boeing 747 aircraft: {len(df)}")
logger.info("Sample of found aircraft:")
logger.info(df.head().to_string())
return True
except Exception as e:
logger.error(f"An error occurred while filtering Boeing 747 data: {e}")
return False
def check_csv_structure(csv_file):
logger.info(f"Checking structure of CSV file: {csv_file}")
try:
df = pd.read_csv(csv_file, nrows=5, quotechar="'", escapechar='\\')
logger.info("First few rows of CSV:")
logger.info(df.to_string())
return True
except Exception as e:
logger.error(f"An error occurred while checking CSV structure: {e}")
return False
def ensure_database_updated(csv_file, db_file):
logger.info(f"Ensuring database {db_file} is up to date...")
return csv_to_sqlite(csv_file, db_file)
def main():
latest_version = get_latest_csv_version(BASE_URL)
if latest_version:
latest_csv_url = BASE_URL + latest_version
latest_csv_filename = latest_version.split('/')[-1]
if not os.path.exists(latest_csv_filename):
if not download_csv(latest_csv_url, latest_csv_filename):
logger.error("Failed to download the latest version.")
return
if check_csv_structure(latest_csv_filename):
if ensure_database_updated(latest_csv_filename, DB_FILENAME):
columns = get_column_names(DB_FILENAME)
if columns:
print_sample(DB_FILENAME, columns)
if filter_boeing_747(DB_FILENAME, columns):
logger.info("Boeing 747 CSV file created/updated.")
else:
logger.error("Failed to create/update Boeing 747 CSV file.")
else:
logger.error("Failed to get column names from the database.")
else:
logger.error("Failed to ensure database is updated.")
else:
logger.error("CSV file structure is not as expected.")
# Cleanup
if os.path.exists(latest_csv_filename):
os.remove(latest_csv_filename)
logger.info("Cleanup complete.")
# Update log file
with open(LOG_FILENAME, 'w') as log_file:
log_file.write(latest_csv_filename)
else:
logger.error("Failed to identify latest version.")
if __name__ == "__main__":
main()