From a06f0a8c76c291432ff18735bfda7a0acbe407ba Mon Sep 17 00:00:00 2001 From: Alexandre Catarino Date: Fri, 24 May 2024 00:13:22 +0100 Subject: [PATCH] Improves Bulk Download Script Example --- ...tes.html => 03 Download Daily Updates.php} | 40 ++++--------- ...tes.html => 03 Download Daily Updates.php} | 42 ++++---------- ...tes.html => 03 Download Daily Updates.php} | 40 ++++--------- ...tes.html => 03 Download Daily Updates.php} | 39 ++++--------- ...tes.html => 03 Download Daily Updates.php} | 39 ++++--------- .../datasets/download_bulk_data_script.php | 58 +++++++++++++++++++ 6 files changed, 111 insertions(+), 147 deletions(-) rename 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/{03 Download Daily Updates.html => 03 Download Daily Updates.php} (61%) rename 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/{03 Download Daily Updates.html => 03 Download Daily Updates.php} (52%) rename 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/{03 Download Daily Updates.html => 03 Download Daily Updates.php} (56%) rename 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/{03 Download Daily Updates.html => 03 Download Daily Updates.php} (68%) rename 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/{03 Download Daily Updates.html => 03 Download Daily Updates.php} (67%) create mode 100644 Resources/datasets/download_bulk_data_script.php diff --git a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.html b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.php similarity index 61% rename from 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.html rename to 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.php index 679d49ffc7..4c512dc641 100644 --- a/05 
Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.html +++ b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/01 CFD Data/03 Download Daily Updates.php @@ -11,34 +11,14 @@

After you subscribe to dataset updates, to update your local copy of the CFD dataset, use the CLI Command Generator to generate your download command and then run it in a terminal in your organization workspace. Alternatively, instead of directly calling the lean data download command, you can place a Python script in the data directory of your organization workspace and run it to update your data files. The following example script updates all data resolutions:

-
-
import os
-from datetime import datetime
-from pytz import timezone
+
 
-# Define a method to download the data
-def download_data(resolution, overwrite=False):
-    print(f"Updating {resolution} data...")
-    command = f'lean data download --dataset "CFD Data" --data-type "Bulk" --resolution "{resolution}"'
-    if overwrite:
-        command += " --overwrite"
-    os.system(command)
-
-# Update minute and second data files
-END_DATE = datetime.now(timezone("US/Eastern")).strftime("%Y%m%d")
-new_data_available = False
-for resolution in ["second", "minute"]:
-    latest_date = sorted([f for f in os.listdir(f"cfd/oanda/{resolution}/xauusd")])[-1].split('_')[0]
-    if latest_date >= END_DATE:
-        print(f"{resolution} data is already up to date.")
-        continue
-    new_data_available = True
-    download_data(resolution)
-
-# Update daily and hourly data files
-if new_data_available:
-    for resolution in ["hour", "daily"]:
-        download_data(resolution, True)
-
- -

The preceding script checks the date of the most recent XAUUSD data you have for second and minute resolutions. If there is new data available for either of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

+

The preceding script checks the date of the most recent XAUUSD data you have for second and minute resolutions. If there is new data available for either of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

\ No newline at end of file diff --git a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.html b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.php similarity index 52% rename from 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.html rename to 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.php index 1c145ad190..347c134557 100644 --- a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.html +++ b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/02 FOREX Data/03 Download Daily Updates.php @@ -11,34 +11,14 @@

After you subscribe to dataset updates, to update your local copy of the Forex dataset, use the CLI Command Generator to generate your download command and then run it in a terminal in your organization workspace. Alternatively, instead of directly calling the lean data download command, you can place a Python script in the data directory of your organization workspace and run it to update your data files. The following example script updates all data resolutions:

-
-
import os
-from datetime import datetime
-from pytz import timezone
-
-# Define a method to download the data
-def download_data(resolution, overwrite=False):
-    print(f"Updating {resolution} data...")
-    command = f'lean data download --dataset "FOREX Data" --data-type "Bulk" --resolution "{resolution}"'
-    if overwrite:
-        command += " --overwrite"
-    os.system(command)
-
-# Update minute and second data files
-END_DATE = datetime.now(timezone("US/Eastern")).strftime("%Y%m%d")
-new_data_available = False
-for resolution in ["second", "minute"]:
-    latest_date = sorted([f for f in os.listdir(f"forex/oanda/{resolution}/eurusd")])[-1].split('_')[0]
-    if latest_date >= END_DATE:
-        print(f"{resolution} data is already up to date.")
-        continue
-    new_data_available = True
-    download_data(resolution)
-
-# Update daily and hourly data files
-if new_data_available:
-    for resolution in ["hour", "daily"]:
-        download_data(resolution, True)
-
- -

To update your local dataset, the preceding script checks the date of the most recent EURUSD data you have for second and minute resolutions. If there is new data available for either of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

+ + +

To update your local dataset, the preceding script checks the date of the most recent EURUSD data you have for all resolutions. If there is new data available for any of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

\ No newline at end of file diff --git a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.html b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.php similarity index 56% rename from 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.html rename to 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.php index 645128537f..a3bad26ea2 100644 --- a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.html +++ b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/03 US Equities/03 Download Daily Updates.php @@ -15,34 +15,14 @@

Alternatively, instead of directly calling the lean data download command, you can place a Python script in the data directory of your organization workspace and run it to update your data files. The following example script updates all data resolutions:

-
-
import os
-from datetime import datetime
-from pytz import timezone
+
 
-# Define a method to download the data
-def download_data(resolution, overwrite=False):
-    print(f"Updating {resolution} data...")
-    command = f'lean data download --dataset "US Equities" --data-type "Bulk" --resolution "{resolution}"'
-    if overwrite:
-        command += " --overwrite"
-    os.system(command)
-
-# Update minute, second, and tick data files
-END_DATE = datetime.now(timezone("US/Eastern")).strftime("%Y%m%d")
-new_data_available = False
-for resolution in ["tick", "second", "minute"]:
-    latest_date = sorted([f for f in os.listdir(f"equity/usa/{resolution}/spy")])[-1].split('_')[0]
-    if latest_date >= END_DATE:
-        print(f"{resolution} data is already up to date.")
-        continue
-    new_data_available = True
-    download_data(resolution)
-
-# Update daily and hourly data files
-if new_data_available:
-    for resolution in ["hour", "daily"]:
-        download_data(resolution, True)
-
- -

The preceding script checks the date of the most recent SPY data you have for tick, second, and minute resolutions. If there is new data available for any of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

+

The preceding script checks the date of the most recent SPY data you have for all resolutions. If there is new data available for any of these resolutions, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

\ No newline at end of file diff --git a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.html b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.php similarity index 68% rename from 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.html rename to 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.php index f6aa53cb93..a37a10b339 100644 --- a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.html +++ b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/05 US Equity Options/03 Download Daily Updates.php @@ -13,31 +13,14 @@

Alternatively, instead of directly calling the lean data download command, you can place a Python script in the data directory of your organization workspace and run it to update your data files. The following example script updates all data resolutions:

-
-
import os
-from datetime import datetime
-from pytz import timezone
-
-# Define a method to download the data
-def download_data(resolution, overwrite=False):
-    print(f"Updating {resolution} data...")
-    command = f'lean data download --dataset "US Equity Options" --data-type "Bulk" --option-style "American" --resolution "{resolution}"'
-    if overwrite:
-        command += " --overwrite"
-    os.system(command)
-
-# Update data files
-END_DATE = datetime.now(timezone("US/Eastern")).strftime("%Y%m%d")
-latest_date = sorted([f for f in os.listdir(f"option/usa/minute/aapl")])[-1].split('_')[0]
-if latest_date >= END_DATE:
-    print(f"Your data is already up to date.")
-else:
-    download_data("minute")
-    for resolution in ['hour', 'daily']:
-        download_data(resolution, True)
-
- -

The preceding script checks the date of the most recent minute resolution data you have for AAPL. If there is new minute data available, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

- - - + + +

The preceding script checks the date of the most recent minute resolution data you have for AAPL. If there is new minute data available, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

\ No newline at end of file diff --git a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.html b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.php similarity index 67% rename from 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.html rename to 05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.php index 996e197246..934941e30b 100644 --- a/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.html +++ b/05 Lean CLI/05 Datasets/05 QuantConnect/02 Download in Bulk/08 US Index Options/03 Download Daily Updates.php @@ -8,31 +8,14 @@

After you subscribe to dataset updates, to update your local copy of the US Index Options dataset, use the CLI Command Generator to generate your download command and then run it in a terminal in your organization workspace. Alternatively, instead of directly calling the lean data download command, you can place a Python script in the data directory of your organization workspace and run it to update your data files. The following example script updates all data resolutions:

-
-
import os
-from datetime import datetime
-from pytz import timezone
-
-# Define a method to download the data
-def download_data(resolution, overwrite=False):
-    print(f"Updating {resolution} data...")
-    command = f'lean data download --dataset "US Index Options" --data-type "Bulk" --resolution "{resolution}"'
-    if overwrite:
-        command += " --overwrite"
-    os.system(command)
-
-# Update data files
-END_DATE = datetime.now(timezone("US/Eastern")).strftime("%Y%m%d")
-latest_date = sorted([f for f in os.listdir(f"indexoption/usa/minute/spx")])[-1].split('_')[0]
-if latest_date >= END_DATE:
-    print(f"Your data is already up to date.")
-else:
-    download_data("minute")
-    for resolution in ['hour', 'daily']:
-        download_data(resolution, True)
-
- -

The preceding script checks the date of the most recent minute resolution data you have for SPX. If there is new minute data available, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

- - - + + +

The preceding script checks the date of the most recent minute resolution data you have for SPX. If there is new minute data available, it downloads the new data files and overwrites your hourly and daily files. If you don't intend to download all resolutions, adjust this script to your needs.

\ No newline at end of file diff --git a/Resources/datasets/download_bulk_data_script.php b/Resources/datasets/download_bulk_data_script.php new file mode 100644 index 0000000000..c46b292d51 --- /dev/null +++ b/Resources/datasets/download_bulk_data_script.php @@ -0,0 +1,58 @@ +
+
import os
+import pandas as pd
+from datetime import datetime, time, timedelta
+from pytz import timezone
+from os.path import abspath, dirname
+os.chdir(dirname(abspath(__file__)))  # make the script's own directory the working directory, so the relative paths below resolve from there
+
+OVERWRITE = False  # when True, __download_data appends --overwrite to every command
+
+# Build one `lean data download` command for the given resolution, print it, and run it
+def __download_data(resolution, start=None, end=None):  # start/end are optional values forwarded to --start/--end
+    print(f"Updating {resolution} data...")
+    command = f'lean data download --dataset "" --data-type "Bulk"  --resolution "{resolution}"'  # NOTE(review): --dataset is empty in this text; presumably filled in by the page template -- confirm
+    if start:
+        end = end if end else start  # no end given: the range collapses to the single value `start`
+        command += f" --start {start} --end {end}"
+    if OVERWRITE:  # module-level switch defined above
+        command += " --overwrite"
+    print(command)  # echo the exact command before running it
+    os.system(command)  # the exit status is not inspected
+
+def __get_end_date() -> str:  # returns a YYYYMMDD string; the __main__ block passes it on as latest_on_cloud
+    now = datetime.now(timezone("US/Eastern"))
+    if now.time() > time(7,30):  # later than 07:30 on the US/Eastern clock: use yesterday's date
+        return (now - timedelta(1)).strftime("%Y%m%d")
+    print('New data is available at 07:30 AM EST')  # note: the message says "EST" while `now` is taken in US/Eastern
+    return (now - timedelta(2)).strftime("%Y%m%d")  # at or before 07:30: fall back to the date two days ago
+
+def __download_high_frequency_data(latest_on_cloud):  # latest_on_cloud: YYYYMMDD string compared with the newest name on disk
+    for resolution in :  # NOTE(review): the iterable is missing in this text; presumably a templated list of resolutions -- confirm
+        dir_name = f"//{resolution}/".lower()  # NOTE(review): path segments around {resolution} are empty here; presumably templated -- confirm
+        if not os.path.exists(dir_name):
+            __download_data(resolution, '19980101')  # NOTE(review): with end omitted, __download_data sets end = start, so only 19980101 is requested -- confirm this is intended
+            continue
+        latest_on_disk = sorted(os.listdir(dir_name))[-1].split('_')[0]  # text before the first '_' in the last entry name, in sorted order
+        if latest_on_disk >= latest_on_cloud:  # plain string comparison
+            print(f"{resolution} data is already up to date.")
+            continue
+        __download_data(resolution, latest_on_disk, latest_on_cloud)  # request the range from the newest local value up to latest_on_cloud
+
+def __download_low_frequency_data(latest_on_cloud):  # latest_on_cloud: YYYYMMDD string compared with the last row of the local file
+    for resolution in ["daily", "hour"]:
+        file_name = f"//{resolution}/.zip".lower()  # NOTE(review): path segments and file stem are empty here; presumably templated -- confirm
+        if not os.path.exists(file_name):
+            __download_data(resolution)  # no start/end, so the command carries no date range
+            continue
+        latest_on_disk = str(pd.read_csv(file_name, header=None)[0].iloc[-1])[:8]  # first 8 characters of the first column in the last row
+        if latest_on_disk >= latest_on_cloud:  # plain string comparison
+            print(f"{resolution} data is already up to date.")
+            continue
+        __download_data(resolution)  # again without a date range
+
+if __name__ == "__main__":  # runs only when the file is executed as a script
+    latest_on_cloud = __get_end_date()  # one date string shared by both update passes
+    __download_low_frequency_data(latest_on_cloud)  # daily and hour files first
+    __download_high_frequency_data(latest_on_cloud)  # then the remaining resolutions
+
\ No newline at end of file