Update PipelineDP4j artifacts

READMEs and examples for PipelineDP4j, add it to GitHub Actions and fix Maven artifact generation. Change-Id: Ibe52784c64bb6be18d503e15ae03efbdbf0ae327 GitOrigin-RevId: 63505a954f03a1cf7f276aecc4c7b7ab2f9c434f
google · Oct 30, 2024 · 4caf604 · 4caf604
1 parent 3328603
commit 4caf604
Show file tree

Hide file tree

Showing 7 changed files with 212 additions and 36 deletions.
diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
@@ -95,6 +95,29 @@ jobs:
         working-directory: privacy-on-beam
         run: bazelisk test --test_timeout_filters=-eternal ...
 
+  pipelinedp4j-tests:
+    name: PipelineDP4J Bazel Tests
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    steps:
+      - uses: actions/checkout@v3
+      - name: Mount Cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/bazel/
+          key: bazel-pipelinedp4j-cache
+      - name: Ensure bazelisk is installed
+        run: bazelisk --version
+      - name: Build PipelineDP4J Workspace
+        working-directory: pipelinedp4j
+        run: bazelisk build ...
+      - name: Test PipelineDP4J Workspace (without long and eternal tests)
+        working-directory: pipelinedp4j
+        run: bazelisk test --test_timeout_filters=-long,-eternal ...
+      - name: Build PipelineDP4J Example
+        working-directory: examples/pipelinedp4j
+        run: bazelisk build ...
+
   zetasql-build:
     name: ZetaSQL Examples Build Test
     runs-on: ubuntu-latest

diff --git a/README.md b/README.md
@@ -9,6 +9,12 @@ private statistics over datasets. It contains the following tools.
 * [Privacy on Beam](privacy-on-beam) is an end-to-end differential privacy
   framework built on top of [Apache Beam](https://beam.apache.org/documentation/).
   It is intended to be easy to use, even by non-experts.
+* [PipelineDP4j](pipelinedp4j) is an end-to-end differential privacy framework
+  for JVM languages (Java, Kotlin, Scala). It supports different data
+  processing frameworks such as
+  [Apache Beam](https://beam.apache.org/documentation/) and
+  [Apache Spark](https://spark.apache.org/) (coming soon). It is intended to
+  be easy to use, even by non-experts.
 * Three "DP building block" libraries, in [C++](cc), [Go](go), and [Java](java).
   These libraries implement basic noise addition primitives and differentially
   private aggregations. Privacy on Beam is implemented using these libraries.
@@ -89,6 +95,12 @@ cd java
 bazel build ...
 ```
 
+To build the PipelineDP4j library, run:
+```shell
+cd pipelinedp4j
+bazel build ...
+```
+
 To build Privacy on Beam, run:
 ```shell
 cd privacy-on-beam

diff --git a/examples/pipelinedp4j/BeamExample.java b/examples/pipelinedp4j/BeamExample.java
@@ -95,18 +95,19 @@ public void run() {
 
     // Define the query
     var query =
-        QueryBuilder.from(data, new UserIdExtractor())
+        QueryBuilder.from(data, /* privacyIdExtractor= */ new UserIdExtractor())
             .groupBy(
                 /* groupKeyExtractor= */ new MovieIdExtractor(),
                 /* maxGroupsContributed= */ 3,
                 /* maxContributionsPerGroup= */ 1,
                 usePublicGroups ? publiclyKnownMovieIds(pipeline) : null)
+            .countDistinctPrivacyUnits("numberOfViewers")
             .count(/* outputColumnName= */ "numberOfViews")
-            .sum(
+            .mean(
                 new RatingExtractor(),
-                /* minTotalValuePerPrivacyUnitInGroup= */ 1.0,
-                /* maxTotalValuePerPrivacyUnitInGroup= */ 5.0,
-                /* outputColumnName= */ "sumOfRatings",
+                /* minValue= */ 1.0,
+                /* maxValue= */ 5.0,
+                /* outputColumnName= */ "averageOfRatings",
                 /* budget= */ null)
             .build();
     // Run the query with DP parameters.
@@ -118,9 +119,11 @@ public void run() {
     SerializableFunction<QueryPerGroupResult, MovieMetrics> mapToMovieMetricsFn =
         perGroupResult -> {
           String movieId = perGroupResult.getGroupKey();
+          long numberOfViewers =
+              round(perGroupResult.getAggregationResults().get("numberOfViewers"));
           long numberOfViews = round(perGroupResult.getAggregationResults().get("numberOfViews"));
-          long sumOfRatings = round(perGroupResult.getAggregationResults().get("sumOfRatings"));
-          return new MovieMetrics(movieId, numberOfViews, sumOfRatings);
+          double averageOfRatings = perGroupResult.getAggregationResults().get("averageOfRatings");
+          return new MovieMetrics(movieId, numberOfViewers, numberOfViews, averageOfRatings);
         };
     // We now have our anonymized metrics of movie views.
     PCollection<MovieMetrics> anonymizedMovieMetrics =

diff --git a/examples/pipelinedp4j/MovieMetrics.java b/examples/pipelinedp4j/MovieMetrics.java
@@ -23,23 +23,28 @@
  */
 final class MovieMetrics {
   private final String movieId;
+
+  private final long numberOfViewers;
   private final long numberOfViews;
-  private final long sumOfRatings;
 
-  MovieMetrics(String movieId, long numberOfViews, long sumOfRatings) {
+  private final double averageOfRatings;
+
+  MovieMetrics(String movieId, long numberOfViewers, long numberOfViews, double averageOfRatings) {
     this.movieId = movieId;
+    this.numberOfViewers = numberOfViewers;
     this.numberOfViews = numberOfViews;
-    this.sumOfRatings = sumOfRatings;
+    this.averageOfRatings = averageOfRatings;
   }
 
   // 0-arg constructor is necessary for serialization to work.
   private MovieMetrics() {
-    this("", 0, 0);
+    this("", 0, 0, 0.0);
   }
 
   @Override
   public String toString() {
     return String.format(
-        "movieId=%s, numberOfViews=%s, sumOfRatings=%s", movieId, numberOfViews, sumOfRatings);
+        "movieId=%s, numberOfViewers=%s, numberOfViews=%s, averageOfRatings=%s",
+        movieId, numberOfViewers, numberOfViews, averageOfRatings);
   }
 }
diff --git a/examples/pipelinedp4j/README.md b/examples/pipelinedp4j/README.md
@@ -1,3 +1,7 @@
+# Running and code walkthrough of BeamExample
+
+## Running
+
 This example demonstrates how to compute differentially private statistics on a
 [Netflix dataset](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data).
 To speed up calculations, we'll use a smaller sample of the full dataset.
@@ -8,7 +12,6 @@ The example code expects a CSV file in the following format: `movie_id`,
 Using this data, the library computes these statistics:
 
 *   Number of views of a certain movie (`count` metric)
-*   Sum of all ratings of a certain movie (`sum` metric)
 *   Number of users who watched a certain movie (`privacy_id_count` metric)
 *   Average rating of a certain movie (`mean` metric)
 
@@ -61,3 +64,93 @@ Here's are the steps to run the example:
     ```shell
     cat output.txt
     ```
+
+## Code walkthrough
+Let's dive into the details of how the code for computing DP statistics is organized.
+
+Warning: this API is experimental and will change in 2025 without backward
+compatibility. The new API version released in 2024 will be supported long-term.
+
+### Key definitions:
+
+- **(Privacy) budget**: every operation leaks some information about individuals. The total privacy cost of a pipeline is the sum of the costs of the calculated statistics. You want this to be below a certain total cost. That's your budget. Typically, the Greek letters 'epsilon' and 'delta' (&epsilon; and &delta;) are used to define the budget.
+Bigger epsilon => more budget => less privacy.
+
+- **Group:** a group is a subset of the data corresponding to a given value of the aggregation criterion. In our example, the groups are movies.
+
+- **Group key:** this is the group identifier. Since in our example the data are aggregated per movie, the group key is a movie_id.
+
+- **A privacy unit** is an entity that we’re trying to protect with differential privacy. Often, this refers to a single individual. An example of a more complex privacy unit is a person+restaurant pair, which protects all visits by an individual to a particular restaurant or, in other words, the fact that a particular person visited any particular restaurant.
+
+- **Privacy ID:** an ID of the unit of privacy that we are protecting. For example, if we protect the presence of the user in a dataset, the privacy ID is the user ID. In this example, the privacy ID is the ID of a user who watched a movie.
+
+- **Contribution bounding** is a process of limiting contributions by a single individual (or an entity represented by a privacy ID) to the output dataset or its partition. This is key for DP algorithms, since protecting unbounded contributions would require adding infinite noise.
+
+- **Group selection** is a process of identifying the group keys that are safe to release in the sense that they don’t break the DP guarantees and don’t leak any user information.
+
+- **Public groups** are group keys that are publicly known and hence don’t leak any user information.
+In our case, we will use public groups since our groups are movies and they are publicly known.
+
+### Reading and pre-processing data
+We need to read and preprocess the data into a `PCollection` such that we can extract the privacy ID, the group key and the values to aggregate from each record.
+In the example, this is encapsulated in the `readData` function.
+
+```java
+PCollection<MovieView> data = readData(pipeline);
+```
+
+### Create DP query
+
+By creating a DP query, we specify which DP operations should be computed on which data.
+
+In PipelineDP4j, the semantics of the data are specified with data extractors: functions that take a single dataset record and return the corresponding object.
+There are 3 types of extractors: `privacyIdExtractor`, `groupKeyExtractor` and `valueExtractor`.
+
+```java
+    var query =
+        QueryBuilder.from(data, /* privacyIdExtractor= */ new UserIdExtractor())
+            .groupBy(
+                /* groupKeyExtractor= */ new MovieIdExtractor(),
+                /* maxGroupsContributed= */ 3,
+                /* maxContributionsPerGroup= */ 1,
+                usePublicGroups ? publiclyKnownMovieIds(pipeline) : null)
+            .countDistinctPrivacyUnits("numberOfViewers")
+            .count(/* outputColumnName= */ "numberOfViews")
+            .mean(
+                new RatingExtractor(),
+                /* minValue= */ 1.0,
+                /* maxValue= */ 5.0,
+                /* outputColumnName= */ "averageOfRatings",
+                /* budget= */ null)
+            .build();
+```
+
+Building a query is similar to writing an SQL query. It consists of the following steps:
+
+1. Create `QueryBuilder` from data and privacyIdExtractor.
+
+1. Call `groupBy`: specify the group key, set the contribution bounding parameters and set the public groups, if any.
+If no public groups are specified, the groups will be determined with the DP group selection procedure.
+
+1. Specify the aggregations to compute (`count`, `countDistinctPrivacyUnits`, `sum`, `mean`, `variance`).
+For non-count aggregations, it is required to specify a value (with a `valueExtractor`) to aggregate.
+
+1. Finish building by calling `.build()`.
+
+Note that optionally it's possible to specify a DP budget per aggregation or
+per `groupBy`. If the budget is not specified, the total budget will be split evenly among all
+aggregations.
+
+### Run query
+
+When running the query, we specify the total (&epsilon;, &delta;)-DP budget and the DP mechanism to apply (the Laplace mechanism in this case).
+
+```java
+    PCollection<QueryPerGroupResult> result =
+        query.run(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE);
+
+```
+
+### Saving results
+Differential privacy has the nice property of being safe under post-processing,
+so it is fine to do any post-processing of the output.
diff --git a/pipelinedp4j/BUILD.bazel b/pipelinedp4j/BUILD.bazel
@@ -35,8 +35,31 @@ _RELEASE_VERSION = "0.0.1"
 pom_file(
     name = "export_pom",
     substitutions = {"RELEASE_VERSION": _RELEASE_VERSION},
+    # Generate this list via `bazelisk query //main/...`
     targets = [
-        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/api",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/api:api",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_collections",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_dp_engine_factory",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/beam:beam_encoders",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:contribution_sampler",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:core_types",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:data_extractors",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_engine",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:dp_functions_params",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:encoders",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core:framework_collections",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:allocated_budget",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_accountant",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/core/budget:budget_spec",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:noise_factories",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/dplibrary:pre_aggregation_partition_selection_factory",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_collections",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_dp_engine_factory",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/local:local_encoders",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:accumulators_kt_proto",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:accumulators_proto",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_kt_proto",
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/proto:dpaggregates_proto",
     ],
     template_file = "pom.template",
 )
@@ -46,4 +69,7 @@ kt_jvm_export(
     srcs = glob(["*.kt"]),
     maven_coordinates = "com.google.privacy.differentialprivacy.pipelinedp4j:pipelinedp4j:%s" % _RELEASE_VERSION,
     pom_template = ":export_pom",
+    runtime_deps = [
+        "//main/com/google/privacy/differentialprivacy/pipelinedp4j/api",
+    ],
 )
diff --git a/pipelinedp4j/README.md b/pipelinedp4j/README.md
@@ -1,34 +1,48 @@
-<!-- TODO: revise all pipelinedp4j related readmes (this one,
-google internal and root differential privacy package readme). -->
-
 # PipelineDP4j
 
-PipelineDP4j is an end-to-end differential privacy solution for JVM that supports various frameworks for distributed data processing such as [Apache Spark](https://spark.apache.org/) and
-[Apache Beam](https://beam.apache.org/documentation/).
-It is intended to be usable by all developers, regardless of their differential
-privacy expertise.
+PipelineDP4j is an end-to-end differential privacy solution for JVM that
+supports various frameworks for distributed data processing such as
+[Apache Beam](https://beam.apache.org/documentation/) and
+[Apache Spark](https://spark.apache.org/) (coming soon). It is intended to be
+usable by all developers, regardless of their differential privacy expertise.
 
 Internally, PipelineDP4j relies on the lower-level building blocks from the
 differential privacy library and combines them into an "out-of-the-box" solution
 that takes care of all the steps that are essential to differential privacy,
-including noise addition, [partition selection](https://arxiv.org/abs/2006.03684),
-and contribution bounding. Thus, rather than using the lower-level differential
-privacy library, it is recommended to use PipelineDP4j, as it can reduce
-implementation mistakes.
+including noise addition,
+[partition selection](https://arxiv.org/abs/2006.03684), and contribution
+bounding. Thus, rather than using the lower-level differential privacy library,
+it is recommended to use PipelineDP4j, as it can reduce implementation mistakes.
 
-PipelineDP4j can be used on any JVM using any JVM compatible language like Kotlin, Scala or Java.
+You can use PipelineDP4j in Java, Kotlin or Scala.
 
 ## How to Use
 
-<!-- TODO: create codelab and check links. -->
-Our [codelab](https://codelabs.developers.google.com/codelabs/pipelinedp4j/)
-about computing private statistics with PipelineDP4j
-demonstrates how to use the library. Source code for the codelab is available in
-the [codelab/](codelab)
-directory.
+WARNING: Current API version (0.0.1) is experimental and will be changed in 2024
+without backward-compatibility. The experimental API won't be supported and
+maintained after that.
+
+### Example
+
+<!-- TODO: create codelab and rewrite this section. -->
+<!-- TODO: generate kDoc of API using Dokka and GitHub pages. -->
+
+Familiarize yourself with an
+[example](https://github.com/google/differential-privacy/tree/main/examples/pipelinedp4j).
+It shows how to compute differentially private statistics on a real dataset
+using the library.
+
+The public API of the library is located in the
+[API package](https://github.com/google/differential-privacy/tree/main/pipelinedp4j/main/com/google/privacy/differentialprivacy/pipelinedp4j/api).
+You can look at it if you need something beyond the example.
+
+### Use the library from the Maven repository
 
-<!-- TODO: insert link. -->
-Full documentation of the API is available as [kdoc]().
+The easiest way to start using the library in your project is to use the
+dependency from the Maven repository. You can find it
+[here](https://mvnrepository.com/artifact/com.google.privacy.differentialprivacy/pipelinedp4j).
+After adding this dependency into your project you can write the same code as in
+the example above and it will compile.
 
-## Using with Bazel
-<!-- TODO: describe how to build. -->
+Please don't use the `0.0.1` version in production code, as it is experimental and
+its maintenance will be stopped in 2024 with the release of the new version.