Skip to content

Commit

Permalink
jlcparts_db_convert.py fts5 - Data cleanup to reduce the database size
Browse files Browse the repository at this point in the history
* Baseline

jlcparts database (cache.sqlite3): 6.0 GB
parts-fts5.db: 4.7 GB
parts-fts5.db.zip: 679.0 MB
Elapsed time: 4 minutes and 57.33 seconds

* Remove duplicate package and category from description, convert double spaces
to single spaces and remove leading and trailing spaces from description.

jlcparts database (cache.sqlite3): 6.0 GB
parts-fts5.db: 4.0 GB
parts-fts5.db.zip: 559.3 MB
Elapsed time: 4 minutes and 9.77 seconds

Reduction of ~700MB (~15%) in database size.
  • Loading branch information
chmorgan authored and Bouni committed Apr 18, 2024
1 parent 41e51f9 commit ff5d611
Showing 1 changed file with 94 additions and 0 deletions.
94 changes: 94 additions & 0 deletions jlcparts_db_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,100 @@ def create_tables(self):
"""
)

def load_tables(self):
    """Load the input data into the output database.

    Reads the ``manufacturers`` and ``categories`` tables from the source
    connection (``self.conn_jp``) into memory, then reads ``components``
    in batches, converts every component into a 12-column row and inserts
    the rows into the ``parts`` table through the output connection
    (``self.conn``), committing after each batch.

    While converting, the description is cleaned up to keep the output
    small: the ROHS marker is normalised, the duplicated second category
    and package are removed, and whitespace runs are collapsed.

    Side effects:
        Sets ``self.part_count`` to the number of components read and
        prints progress messages to stdout.
    """
    # Number of component rows fetched from the source per batch.
    batch_size = 100000

    # load the small lookup tables into memory
    print("Reading manufacturers")
    res = self.conn_jp.execute("SELECT * FROM manufacturers")
    # dict() over the fetched rows: first column -> second column
    mans = dict(res.fetchall())

    print("Reading categories")
    res = self.conn_jp.execute("SELECT * FROM categories")
    # first column -> (first category, second category)
    cats = {i: (c, sc) for i, c, sc in res.fetchall()}

    res = self.conn_jp.execute("select count(*) from components")
    results = res.fetchone()
    print(f"{humanize.intcomma(results[0])} parts to import")

    self.part_count = 0
    print("Reading components")
    res = self.conn_jp.execute("SELECT * FROM components")
    while True:
        comps = res.fetchmany(size=batch_size)

        print(f"Read {humanize.intcomma(len(comps))} parts")

        # if we have no more parts exit out of the loop
        if not comps:
            break

        self.part_count += len(comps)

        # now extract the data from the jlcparts db and fill
        # it into the plugin database
        print("Building parts rows to insert")
        rows = []
        for c in comps:
            # c[10] holds a JSON list of price breaks; flatten it to
            # "from-to:price" entries, leaving "to" empty when it is null.
            price_str = ",".join(
                f"{entry.get('qFrom')}-"
                f"{'' if entry.get('qTo') is None else entry.get('qTo')}:"
                f"{entry.get('price')}"
                for entry in json.loads(c[10])
            )

            description = c[7]

            # Strip ROHS out of descriptions where present and add
            # 'not ROHS' where it is absent, since nearly all parts are
            # ROHS and the marker would otherwise be stored on each one.
            # NOTE(review): detection is case-insensitive but the removal
            # only matches the exact upper-case " ROHS"; kept as-is.
            if " rohs" not in description.lower():
                description += " not ROHS"
            else:
                description = description.replace(" ROHS", "")

            first_category, second_category = cats[c[1]]
            package = c[3]

            # The second category and the package are stored in their own
            # columns, so drop them from the description if duplicated.
            description = description.replace(second_category, "")
            description = description.replace(package, "")

            # Collapse whitespace runs to a single space and trim both
            # ends. str.replace() returns a new string, so the result has
            # to be assigned; removing the substrings above can leave
            # runs longer than two spaces, hence split/join instead of a
            # single double-to-single replace.
            description = " ".join(description.split())

            rows.append(
                (
                    f"C{c[0]}",  # LCSC Part
                    first_category,  # First Category
                    second_category,  # Second Category
                    c[2],  # MFR.Part
                    package,  # Package
                    int(c[4]),  # Solder Joint
                    mans[c[5]],  # Manufacturer
                    "Basic" if c[6] else "Extended",  # Library Type
                    description,  # Description
                    c[8],  # Datasheet
                    price_str,  # Price
                    str(c[9]),  # Stock
                )
            )

        print("Inserting into parts table")
        self.conn.executemany(
            "INSERT INTO parts VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", rows
        )
        self.conn.commit()

    print("Done importing parts")

def populate_categories(self):
"""Populate the categories table."""
self.conn.execute(
Expand Down

0 comments on commit ff5d611

Please sign in to comment.