From f879851466ff4950f08739011ace532f1b04498f Mon Sep 17 00:00:00 2001 From: Ian Wang <22849821+wangyinz@users.noreply.github.com> Date: Wed, 8 Jan 2025 21:51:03 -0600 Subject: [PATCH 1/8] add openjdk-8-jdk for spark --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 7754d8abd..220871d2d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -48,6 +48,7 @@ jobs: export PATH=$PATH:/opt/spark-3.0.0-bin-hadoop2.7/bin echo "/opt/spark-3.0.0-bin-hadoop2.7/bin" >> $GITHUB_PATH python -m pip install --upgrade --upgrade-strategy eager pyspark pytest-spark + sudo apt-get update && sudo apt-get install openjdk-8-jdk - name: Install run: python -m pip install -C--global-option=build -C--global-option=--debug -v '.[seisbench]' - name: Test with pytest From f07f06e75973a0bac8eaf38ea85697251e7fffdf Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 12:13:55 +0800 Subject: [PATCH 2/8] relax boost requirement to 1.74 --- cxx/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cxx/CMakeLists.txt b/cxx/CMakeLists.txt index 79f211c43..111b78156 100644 --- a/cxx/CMakeLists.txt +++ b/cxx/CMakeLists.txt @@ -46,7 +46,7 @@ endif () message (STATUS "YAML_CPP_LIBRARIES = ${YAML_CPP_LIBRARIES}") message (STATUS "YAML_CPP_INCLUDE_DIR = ${YAML_CPP_INCLUDE_DIR}") -find_package (Boost 1.86.0 COMPONENTS serialization) +find_package (Boost 1.74.0 COMPONENTS serialization) if (NOT Boost_FOUND) message (STATUS "Building Boost") include (cmake/boost.cmake) From c1d61aa45c858b2ef7f78bde493a34de601f85eb Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 12:21:59 +0800 Subject: [PATCH 3/8] update spark to 3.5.4 matching pyspark --- .github/workflows/python-package.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 220871d2d..6e7bb245a 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -40,14 +40,14 @@ jobs: - name: Install Apache Spark run: | mkdir -p /opt - wget -q -O /opt/spark.tgz https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz + wget -q -O /opt/spark.tgz https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz tar xzf /opt/spark.tgz -C /opt/ rm /opt/spark.tgz - export SPARK_HOME=/opt/spark-3.0.0-bin-hadoop2.7 - echo "SPARK_HOME=/opt/spark-3.0.0-bin-hadoop2.7" >> $GITHUB_ENV - export PATH=$PATH:/opt/spark-3.0.0-bin-hadoop2.7/bin - echo "/opt/spark-3.0.0-bin-hadoop2.7/bin" >> $GITHUB_PATH - python -m pip install --upgrade --upgrade-strategy eager pyspark pytest-spark + export SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3 + echo "SPARK_HOME=/opt/spark-3.5.4-bin-hadoop3" >> $GITHUB_ENV + export PATH=$PATH:/opt/spark-3.5.4-bin-hadoop3/bin + echo "/opt/spark-3.5.4-bin-hadoop3/bin" >> $GITHUB_PATH + PYSPARK_HADOOP_VERSION=3 python -m pip install --upgrade --upgrade-strategy eager pyspark pytest-spark sudo apt-get update && sudo apt-get install openjdk-8-jdk - name: Install run: python -m pip install -C--global-option=build -C--global-option=--debug -v '.[seisbench]' From 5bc6e3231c889b243818e8466b007cb99b3c1418 Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 14:43:15 +0800 Subject: [PATCH 4/8] add computation for sanity check --- python/mspasspy/client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/mspasspy/client.py b/python/mspasspy/client.py index af399d20f..c6a6138f0 100644 --- a/python/mspasspy/client.py +++ b/python/mspasspy/client.py @@ -195,6 +195,7 @@ def __init__( .master(self._spark_master_url) .getOrCreate() ) + spark.parallelize([1, 2, 3, 4, 5]).collect() self._spark_context = spark.sparkContext except Exception as err: raise MsPASSError( From 3f2d358d5e68eae808a765fd4dd0cd7eb86f9e3c Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 15:37:00 +0800 Subject: [PATCH 5/8] fix wrong use of sparkSession vs sparkContext --- python/mspasspy/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mspasspy/client.py b/python/mspasspy/client.py index c6a6138f0..28b283f97 100644 --- a/python/mspasspy/client.py +++ b/python/mspasspy/client.py @@ -195,7 +195,7 @@ def __init__( .master(self._spark_master_url) .getOrCreate() ) - spark.parallelize([1, 2, 3, 4, 5]).collect() + spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect() self._spark_context = spark.sparkContext except Exception as err: raise MsPASSError( From ec6769c845f13b53fcbf4963e4ee72d0cad9ad89 Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 17:06:41 +0800 Subject: [PATCH 6/8] add debug --- python/mspasspy/client.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/mspasspy/client.py b/python/mspasspy/client.py index 28b283f97..788025496 100644 --- a/python/mspasspy/client.py +++ b/python/mspasspy/client.py @@ -195,7 +195,11 @@ def __init__( .master(self._spark_master_url) .getOrCreate() ) - spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect() + print("DEBUG====================") + print(spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect()) + print(spark) + print(spark.sparkContext) + print("DEBUG END================") self._spark_context = spark.sparkContext except Exception as err: raise MsPASSError( From e0cc49a592868669bc51e91d94519dce2cbf0682 Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 18:13:02 +0800 Subject: [PATCH 7/8] change the mocked method --- python/mspasspy/client.py | 5 ----- python/tests/test_mspass_client_spark_dask.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/python/mspasspy/client.py b/python/mspasspy/client.py index 788025496..af399d20f 100644 --- a/python/mspasspy/client.py +++ b/python/mspasspy/client.py @@ -195,11 +195,6 @@ def __init__( .master(self._spark_master_url) .getOrCreate() ) - print("DEBUG====================") - print(spark.sparkContext.parallelize([1, 2, 3, 4, 5]).collect()) - print(spark) - print(spark.sparkContext) - print("DEBUG END================") self._spark_context = spark.sparkContext except Exception as err: raise MsPASSError( diff --git a/python/tests/test_mspass_client_spark_dask.py b/python/tests/test_mspass_client_spark_dask.py index 2b80473d9..ef9a7d614 100644 --- a/python/tests/test_mspass_client_spark_dask.py +++ b/python/tests/test_mspass_client_spark_dask.py @@ -161,7 +161,7 @@ def test_spark_scheduler(self, monkeypatch): monkeypatch.setenv("MSPASS_SCHEDULER", "spark") monkeypatch.setenv("MSPASS_SCHEDULER_ADDRESS", "168.0.0.1") monkeypatch.setenv("SPARK_MASTER_PORT", "12345") - monkeypatch.setattr(SparkSession.builder, "appName", mock_excpt) + monkeypatch.setattr(SparkSession, "builder", mock_excpt) with pytest.raises( MsPASSError, match="Runntime error: cannot create a spark configuration with: spark://168.0.0.1:12345", @@ -257,7 +257,7 @@ def test_set_scheduler(self, monkeypatch): assert self.client._dask_client == temp_dask_client # test set spark, previous is dask - monkeypatch.setattr(SparkSession.builder, "config", mock_excpt) + monkeypatch.setattr(SparkSession, "builder", mock_excpt) with pytest.raises( MsPASSError, match="Runntime error: cannot create a spark configuration with: spark://168.1.2.3:7077", From 019b351a5fe4863f4fced096a2e7ac2724eede45 Mon Sep 17 00:00:00 2001 From: wangyinz Date: Thu, 9 Jan 2025 18:34:41 +0800 Subject: [PATCH 8/8] mock all SparkSession.builder --- python/tests/test_mspass_client_spark_dask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tests/test_mspass_client_spark_dask.py b/python/tests/test_mspass_client_spark_dask.py index ef9a7d614..c441959b5 100644 --- a/python/tests/test_mspass_client_spark_dask.py +++ b/python/tests/test_mspass_client_spark_dask.py @@ -150,7 +150,7 @@ def test_dask_scheduler(self, monkeypatch): monkeypatch.undo() def test_spark_scheduler(self, monkeypatch): - monkeypatch.setattr(SparkSession.builder, "appName", mock_excpt) + monkeypatch.setattr(SparkSession, "builder", mock_excpt) with pytest.raises( MsPASSError, match="Runntime error: cannot create a spark configuration with: spark://168.0.0.1", @@ -271,7 +271,7 @@ def test_set_scheduler(self, monkeypatch): # test set spark, previous is spark test_client_2 = Client(scheduler="spark") - monkeypatch.setattr(SparkSession.builder, "config", mock_excpt) + monkeypatch.setattr(SparkSession, "builder", mock_excpt) with pytest.raises( MsPASSError, match="Runntime error: cannot create a spark configuration with: spark://123.4.5.6:7077",