Merge pull request #2047 from rapidsai/branch-22.02
[RELEASE] cugraph v22.02
raydouglass authored Feb 2, 2022
2 parents 3a43e9d + 2e0e345 commit 47c0c22
Showing 323 changed files with 12,739 additions and 3,970 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__
.cache
.coverage
.vscode
.lock
*.swp
*.pytest_cache
DartConfiguration.tcl
@@ -32,6 +33,9 @@ dist/
cugraph.egg-info/
python/build
python/cugraph/bindings/*.cpp

## pylibcugraph build directories & artifacts
python/pylibcugraph/pylibcugraph.egg-info

## Patching
*.diff
@@ -82,6 +86,9 @@ python/_external_repositories/

# created by Dask tests
python/dask-worker-space
python/cugraph/dask-worker-space
python/cugraph/cugraph/dask-worker-space
python/cugraph/cugraph/tests/dask-worker-space

# Sphinx docs & build artifacts
docs/cugraph/source/api_docs/api/*
137 changes: 135 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

17 changes: 0 additions & 17 deletions Dockerfile

This file was deleted.

10 changes: 8 additions & 2 deletions README.md
@@ -11,6 +11,7 @@ The [RAPIDS](https://rapids.ai) cuGraph library is a collection of GPU accelerat
As an example, the following Python snippet loads graph data and computes PageRank:

```python
import cudf
import cugraph

# read data into a cuDF DataFrame using read_csv
@@ -78,7 +79,7 @@ _Italic_ algorithms are planned for future releases.
| Link Analysis| | | |
| | Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Pagerank) |
| | Personal Pagerank | Multi-GPU | [C++ README](cpp/src/centrality/README.md#Personalized-Pagerank) |
| | HITS | Single-GPU | leverages Gunrock |
| | HITS | Single-GPU | Multi-GPU C code is ready, Python wrapper in 22.04 |
| Link Prediction | | | |
| | Jaccard Similarity | Single-GPU | |
| | Weighted Jaccard Similarity | Single-GPU | |
@@ -87,7 +88,8 @@ _Italic_ algorithms are planned for future releases.
| | _Local Clustering Coefficient_ | --- | |
| Sampling | | | |
| | Random Walks (RW) | Single-GPU | Biased and Uniform |
| | _node2vec_ | --- | |
| | Egonet | Single-GPU | multi-seed |
| | _node2vec_ | --- | C code is ready, Python wrapper coming in 22.04 |
| Traversal | | | |
| | Breadth First Search (BFS) | Multi-GPU | with cutoff support <br/> [C++ README](cpp/src/traversal/README.md#BFS) |
| | Single Source Shortest Path (SSSP) | Multi-GPU | [C++ README](cpp/src/traversal/README.md#SSSP) |
@@ -97,6 +99,7 @@ _Italic_ algorithms are planned for future releases.
| Other | | | |
| | Renumbering | Multi-GPU | multiple columns, any data type |
| | Symmetrize | Multi-GPU | |
| | Path Extraction | | Extract paths from BFS/SSP results in parallel |
| Data Generator | | | |
| | RMAT | Multi-GPU | |
| | _Barabasi-Albert_ | --- | |
@@ -169,6 +172,9 @@ conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=

# CUDA 11.4
conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.4

# CUDA 11.5
conda install -c nvidia -c rapidsai -c numba -c conda-forge cugraph cudatoolkit=11.5
```

Note: This conda installation only applies to Linux and Python versions 3.7/3.8.
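The PageRank snippet in the README diff above is cut off by the collapsed view. As a rough, CPU-only illustration of the computation `cugraph.pagerank` performs (plain Python power iteration, not the cuGraph API; the example graph is invented for illustration):

```python
# Pure-Python power-iteration sketch of PageRank. cugraph.pagerank runs the
# equivalent computation on the GPU over a cuDF edge list.
def pagerank(edges, damping=0.85, iters=50):
    nodes = sorted({v for e in edges for v in e})
    out_deg = {n: 0 for n in nodes}
    for src, _ in edges:
        out_deg[src] += 1
    rank = {n: 1.0 / len(nodes) for n in nodes}
    for _ in range(iters):
        contrib = {n: 0.0 for n in nodes}
        for src, dst in edges:
            contrib[dst] += rank[src] / out_deg[src]
        # Dangling nodes (no out-edges) spread their rank uniformly.
        dangling = sum(rank[n] for n in nodes if out_deg[n] == 0)
        rank = {n: (1 - damping) / len(nodes)
                   + damping * (contrib[n] + dangling / len(nodes))
                for n in nodes}
    return rank

# Tiny directed graph: 0 -> 1, 1 -> 2, 2 -> 0, 2 -> 1
scores = pagerank([(0, 1), (1, 2), (2, 0), (2, 1)])
```

Node 1 ends up ranked above node 0 because it receives contributions from two sources, and the scores stay normalized to 1 at every iteration.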
58 changes: 36 additions & 22 deletions SOURCEBUILD.md
@@ -52,6 +52,8 @@ conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11
# for CUDA 11.4
conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.4.yml

# for CUDA 11.5
conda env create --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.5.yml

# activate the environment
conda activate cugraph_dev
@@ -65,11 +67,8 @@ conda deactivate

```bash

# for CUDA 11.0
conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.0.yml

# for CUDA 11.2
conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.2.yml
# Where XXX is the CUDA 11 version
conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.XXX.yml

conda activate cugraph_dev
```
@@ -90,16 +89,22 @@ There are several other options available on the build script for advanced users
```bash
build.sh [<target> ...] [<flag> ...]
where <target> is:
clean - remove all existing build artifacts and configuration (start over)
libcugraph - build the cugraph C++ code
cugraph - build the cugraph Python package
docs - build the docs
clean - remove all existing build artifacts and configuration (start over)
uninstall - uninstall libcugraph and cugraph from a prior build/install (see also -n)
libcugraph - build libcugraph.so and SG test binaries
libcugraph_etl - build libcugraph_etl.so and SG test binaries
cugraph - build the cugraph Python package
pylibcugraph - build the pylibcugraph Python package
cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency.
docs - build the docs
and <flag> is:
-v - verbose build mode
-g - build for debug
-n - no install step
-n - do not install after a successful build
--allgpuarch - build for all supported GPU architectures
--buildfaiss - build faiss statically into cugraph
--show_depr_warn - show cmake deprecation warnings
--skip_cpp_tests - do not build the SG test binaries as part of the libcugraph and libcugraph_etl targets
-h - print this text

default action (no args) is to build and install 'libcugraph' then 'cugraph' then 'docs' targets
@@ -142,24 +147,43 @@ The default installation locations are `$CMAKE_INSTALL_PREFIX/lib` and `$CMAKE_I
### Building and installing the Python package
2) Install the Python package to your Python path:
2) Install the Python packages to your Python path:
```bash
cd $CUGRAPH_HOME
cd python
cd pylibcugraph
python setup.py build_ext --inplace
python setup.py install # install pylibcugraph
cd ../cugraph
python setup.py build_ext --inplace
python setup.py install # install cugraph python bindings

```
## Run tests
If you already have the datasets:
```bash
export RAPIDS_DATASET_ROOT_DIR=<path_to_ccp_test_and_reference_data>
```
If you do not have the datasets:
```bash
cd $CUGRAPH_HOME/datasets
source get_test_data.sh #This takes about 10 minutes and downloads 1GB data (>5 GB uncompressed)
```
Run either the C++ or the Python tests with datasets
- **Python tests with datasets**
```bash
pip install python-louvain #some tests require this package to run
cd $CUGRAPH_HOME
cd python
pytest
@@ -176,17 +200,7 @@ Run either the C++ or the Python tests with datasets
```
- **C++ tests with larger datasets**
If you already have the datasets:
```bash
export RAPIDS_DATASET_ROOT_DIR=<path_to_ccp_test_and_reference_data>
```
If you do not have the datasets:
```bash
cd $CUGRAPH_HOME/datasets
source get_test_data.sh #This takes about 10 minutes and downloads 1GB data (>5 GB uncompressed)
```
Run the C++ tests on large input:
18 changes: 11 additions & 7 deletions benchmarks/python_e2e/cugraph_dask_funcs.py
@@ -25,6 +25,8 @@
from cugraph.generators import rmat
import tempfile

import rmm


def generate_edgelist(scale,
edgefactor,
@@ -52,9 +54,9 @@ def generate_edgelist(scale,
ddf = rmat(
scale,
(2**scale)*edgefactor,
0.1,
0.2,
0.3,
0.57, # from Graph500
0.19, # from Graph500
0.19, # from Graph500
seed or 42,
clip_and_flip=False,
scramble_vertex_ids=True,
@@ -153,7 +155,7 @@ def katz(G, alpha=None):
################################################################################
# Session-wide setup and teardown

def setup(dask_scheduler_file=None):
def setup(dask_scheduler_file=None, rmm_pool_size=None):
if dask_scheduler_file:
cluster = None
# Env var UCX_MAX_RNDV_RAILS=1 must be set too.
@@ -167,7 +169,7 @@ def setup(dask_scheduler_file=None):

else:
tempdir_object = tempfile.TemporaryDirectory()
cluster = LocalCUDACluster(local_directory=tempdir_object.name)
cluster = LocalCUDACluster(local_directory=tempdir_object.name, rmm_pool_size=rmm_pool_size)
client = Client(cluster)
# add the obj to the client so it doesn't get deleted until
# the 'client' obj gets cleaned up
@@ -180,7 +182,9 @@

def teardown(client, cluster=None):
Comms.destroy()
client.close()
# Shutdown the connected scheduler and workers
# therefore we will no longer rely on killing the dask cluster ID
# for MNMG runs
client.shutdown()
if cluster:
cluster.close()

6 changes: 3 additions & 3 deletions benchmarks/python_e2e/cugraph_funcs.py
@@ -43,9 +43,9 @@ def generate_edgelist(scale,
df = rmat(
scale,
(2**scale)*edgefactor,
0.1,
0.2,
0.3,
0.57, # from Graph500
0.19, # from Graph500
0.19, # from Graph500
seed or 42,
clip_and_flip=False,
scramble_vertex_ids=True,
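Both benchmark files above replace the placeholder RMAT probabilities with the Graph500 values (a=0.57, b=0.19, c=0.19). A quick sketch checks that the implied fourth-quadrant probability completes the distribution, and shows the edge-count formula the benchmarks pass to `rmat()` (the `scale`/`edgefactor` values here are illustrative, not from the diff):

```python
# Graph500-style RMAT quadrant probabilities, as set in the diffs above.
a, b, c = 0.57, 0.19, 0.19
d = 1.0 - (a + b + c)  # remaining quadrant; rmat() receives only a, b, c

# The benchmarks request (2**scale) * edgefactor edges.
scale, edgefactor = 10, 16  # illustrative values
num_edges = (2 ** scale) * edgefactor
```

With the Graph500 parameters the four probabilities sum to 1, with d = 0.05 skewing edges heavily toward the first quadrant, which is what produces the power-law degree distribution.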
11 changes: 8 additions & 3 deletions benchmarks/python_e2e/main.py
@@ -62,7 +62,8 @@ def run(algos,
symmetric=False,
edgefactor=None,
benchmark_dir=None,
dask_scheduler_file=None):
dask_scheduler_file=None,
rmm_pool_size=None):
"""
Run the nightly benchmark on cugraph.
Return True on success, False on failure.
@@ -100,7 +101,7 @@
# Call the global setup. This is used for setting up Dask, initializing
# output files/reports, etc.
log("calling setup...", end="")
setup_objs = funcs.setup(dask_scheduler_file)
setup_objs = funcs.setup(dask_scheduler_file, rmm_pool_size)

# If the number of GPUs is None, This is a MNMG run
# Extract the number of gpus from the client
@@ -179,6 +180,9 @@
"(num_edges=num_verts*EDGEFACTOR).")
ap.add_argument("--benchmark-dir", type=str, default=None,
help="directory to store the results in json files")
ap.add_argument("--rmm-pool-size", type=str, default=None,
help="RMM pool size to initialize each worker with")


args = ap.parse_args()

@@ -190,6 +194,7 @@
symmetric=args.symmetric_graph,
edgefactor=args.edgefactor,
benchmark_dir=args.benchmark_dir,
dask_scheduler_file=args.dask_scheduler_file)
dask_scheduler_file=args.dask_scheduler_file,
rmm_pool_size=args.rmm_pool_size)

sys.exit(exitcode)
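The new `--rmm-pool-size` flag added above is a plain string argument defaulting to `None`. A minimal, self-contained sketch of the parsing behavior (the argument definition matches the diff; the parsed value is illustrative):

```python
import argparse

ap = argparse.ArgumentParser()
ap.add_argument("--rmm-pool-size", type=str, default=None,
                help="RMM pool size to initialize each worker with")

# argparse converts the dashes to an underscore attribute name.
args = ap.parse_args(["--rmm-pool-size", "24GB"])
print(args.rmm_pool_size)  # → 24GB
```

When the flag is omitted, `args.rmm_pool_size` stays `None`, which `setup()` passes straight through to `LocalCUDACluster`, leaving RMM pool allocation disabled.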
