
Commit

Document privacy; benchmark changes
Mihai Budiu committed Feb 4, 2020
1 parent 3ed7dc0 commit f60e09d
Showing 5 changed files with 93 additions and 49 deletions.
2 changes: 2 additions & 0 deletions data/ontime_private/gen_metadata.py
@@ -27,6 +27,8 @@ def get_metadata(cn):
(g, gMin, gMax) = (10, 0, 5100)
elif cn == "FlightDate":
# cluster values: (86400000, 852076800000, 1561852800000)
# 2017 values: (86400000, 1483286400000, 1514736000000)
# 2016 values, 2 months: (86400000, 1451635200000, 1456732800000)
(g, gMin, gMax) = (86400000, 1451635200000, 1456732800000)
else:
raise Exception("Unexpected column " + cn)
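As a side note (not part of the commit), a quick Python sketch of what these epoch-millisecond constants correspond to; the 08:00 UTC offsets suggest the bounds are midnights in Pacific time, which is an inference, not something stated in the source:

```python
from datetime import datetime, timezone

# FlightDate quantization from the metadata above: (granularity, min, max), all in milliseconds.
g, g_min, g_max = 86400000, 1451635200000, 1456732800000

print(g // 86400000, "day(s) per quantization interval")      # 1
print(datetime.fromtimestamp(g_min / 1000, tz=timezone.utc))  # 2016-01-01 08:00:00+00:00
print(datetime.fromtimestamp(g_max / 1000, tz=timezone.utc))  # 2016-02-29 08:00:00+00:00
print((g_max - g_min) // g, "one-day buckets in the range")   # 59
```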
11 changes: 0 additions & 11 deletions platform/src/main/java/org/hillview/utils/Utilities.java
@@ -422,17 +422,6 @@ public static void intPairToByteArray(Pair<Integer, Integer> p, /*out*/byte[] ar
intToByteArray(p.second, arr, startIndex + INT_SIZE);
}

public static void intPairPairToByteArray(Pair<Integer, Integer> p1, Pair<Integer, Integer> p2, /*out*/ byte[] arr, int startIndex) {
if (arr.length < 2) {
throw new RuntimeException("Not enough bytes allocated for output");
}

Arrays.fill(arr, (byte)0);

intPairToByteArray(p1, arr, startIndex);
intPairToByteArray(p2, arr, startIndex + 2*INT_SIZE);
}

public static long byteArrayToLong(byte[] bytes) {
if (bytes.length < 8) {
throw new RuntimeException("Not enough bytes to convert to int");
25 changes: 19 additions & 6 deletions privacy.md
@@ -47,6 +47,19 @@ dataset when using differential privacy.

## Privacy policy

### Privacy policy location

The root node treats a dataset as private if it can locate a
privacy policy file attached to it. All privacy policy files are
named `privacy_policy.json`. The policies are stored as regular files
on the filesystem of the root node. For a dataset composed of files
(CSV, JSON, ORC, etc.), the privacy policy is located in a directory
whose path matches the directory where the data files reside. For a
table `table` residing in a JDBC database `database`, the policy
resides in a directory named `database`/`table`.
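
As an illustration of the rule above -- a minimal sketch in which the helper names and the `flights`/`ontime` identifiers are hypothetical, not taken from the commit:

```python
import os

def policy_path_for_files(data_dir: str) -> str:
    # Files-based dataset (CSV, JSON, ORC, ...): the policy lives in the
    # directory on the root node that matches the data directory.
    return os.path.join(data_dir, "privacy_policy.json")

def policy_path_for_jdbc(database: str, table: str) -> str:
    # JDBC dataset: the policy lives in a directory named database/table.
    return os.path.join(database, table, "privacy_policy.json")

print(policy_path_for_files("data/ontime_private"))  # data/ontime_private/privacy_policy.json
print(policy_path_for_jdbc("flights", "ontime"))     # flights/ontime/privacy_policy.json
```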

### File format

The following shows an excerpt from a privacy policy for a table with
4 columns: `OriginState` (String), `Origin` (String), `DepTime` (Numeric), and
`DepDelay` (Numeric).
@@ -57,7 +70,7 @@ can be leaked when data in the corresponding set of columns is
visualized. In the following example, the epsilon for the column
`Origin` is set to 2, the epsilon for zero columns (used when displaying
the total number of rows) is set to 1, the epsilon for any other single
column is set to 1.5, and the epsilon for any pair of columns is set to
0.1. (`defaultEpsilons` maps a column count to a privacy value.)
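
A minimal sketch of the epsilon lookup these paragraphs describe; everything below except the `defaultEpsilons` name, the column names, and the numeric values quoted above is an assumption about how the policy is organized, not taken from the actual file:

```python
# Explicit epsilons for specific column sets; the key is the set of columns visualized.
epsilons = {
    frozenset(["Origin"]): 2.0,
}
# defaultEpsilons maps a column count to a privacy value.
default_epsilons = {0: 1.0, 1: 1.5, 2: 0.1}

def epsilon_for(columns):
    key = frozenset(columns)
    return epsilons.get(key, default_epsilons[len(columns)])

print(epsilon_for([]))                     # 1.0 -- total row count
print(epsilon_for(["Origin"]))             # 2.0 -- explicit override
print(epsilon_for(["DepDelay"]))           # 1.5 -- any other single column
print(epsilon_for(["Origin", "DepTime"]))  # 0.1 -- any pair of columns
```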

The privacy policy also specifies for each column a *quantization
@@ -139,12 +152,12 @@ probably exceed the true count almost always.

For private data, the only charting options currently available are
histograms/pie charts, heatmaps, and Trellis plots of histograms. This
functionality can be extended in the future to encompass 2D histograms
and the other Trellis plots.

#### Histograms

Histogram plots over private data differ in the following ways from
non-private histograms:

* Counts are displayed as ranges.
@@ -159,15 +172,15 @@ of the number of entries that start with each letter of the alphabet.

#### Pie charts

For pie charts, the count and percentage of each slice are shown as a
range (however, if all displayed digits of a range are the same --
e.g., 9.3K -- the range may be displayed as a single value).

#### Heatmaps

For heatmaps, the chart displays only the data that has high enough
confidence (count > threshold * confidence_interval). The view menu
offers the ability to change the threshold from its default value of 2.
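
A small sketch of that filtering rule (the names below are illustrative, not taken from the Hillview code):

```python
def cell_is_displayed(count: float, confidence_interval: float, threshold: float = 2.0) -> bool:
    # A heatmap cell is rendered only when its (noisy) count is large
    # relative to the confidence interval of the added noise.
    return count > threshold * confidence_interval

print(cell_is_displayed(count=500, confidence_interval=100))  # True:  500 > 2 * 100
print(cell_is_displayed(count=150, confidence_interval=100))  # False: 150 <= 2 * 100
```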

When mousing over a cell in a heatmap, the value is displayed as a
range; simultaneously, the range is shown on the legend above.
@@ -53,7 +53,6 @@
import javax.annotation.Nullable;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -204,9 +203,7 @@ private Pair<Double, Double> computeSingleColumnAccuracy(String col, int colInde
Histogram hist = table.blockingSketch(sk); // Leaf counts.
Assert.assertNotNull(hist);

int totalLeaves = dd.getQuantizationIntervalCount();
TestKeyLoader tkl = new TestKeyLoader();

ArrayList<Double> accuracies = new ArrayList<>();
double totAccuracy = 0.0;
for (int i = 0 ; i < iterations; i++) {
@@ -246,10 +243,7 @@ private Pair<Double, Double> computeHeatmapAccuracy(String col1, ColumnQuantizat
Heatmap heatmap = table.blockingSketch(sk); // Leaf counts.
Assert.assertNotNull(heatmap);

int totalXLeaves = d0.getQuantizationIntervalCount();
int totalYLeaves = d1.getQuantizationIntervalCount();
TestKeyLoader tkl = new TestKeyLoader();

ArrayList<Double> accuracies = new ArrayList<>();
double totAccuracy = 0.0;
for (int i = 0 ; i < iterations; i++) {
@@ -356,7 +350,7 @@ public void benchmarkHeatmapL2Accuracy() throws IOException {
writer.close();
}

public static void main(String[] args) throws IOException, SQLException {
public static void main(String[] args) throws IOException {
if (args.length < 1) {
return;
}
96 changes: 71 additions & 25 deletions web/src/main/java/org/hillview/benchmarks/DPPerfBenchmarks.java
@@ -45,6 +45,7 @@
import org.hillview.utils.Converters;
import org.hillview.utils.HillviewLogger;
import org.hillview.utils.HostList;
import org.hillview.utils.Utilities;

import javax.annotation.Nullable;
import java.io.IOException;
@@ -193,14 +194,15 @@ static class ExperimentConfig {
boolean useRawData;
boolean usePostProcessing;
Dataset dataset;
int bucketCount = DPPerfBenchmarks.buckets;
int machines = 1;

public String toString() {
String result = "";
result += this.dataset.toString();
result += this.useRawData ? " public" : " quantized";
result += this.usePostProcessing ? " noised" : "";
result += "," + this.machines;
result += "," + this.machines + "," + this.bucketCount;
return result;
}
}
@@ -224,13 +226,15 @@ private ColumnConfig getColumnConfig(ColumnDescription col, ExperimentConfig con
if (col.kind.isNumeric() || col.kind == ContentsKind.Date) {
DoubleColumnQuantization dq = (DoubleColumnQuantization)q;
assert dq != null;
DoubleHistogramBuckets b = new DoubleHistogramBuckets(dq.globalMin, dq.globalMax, buckets);
DoubleHistogramBuckets b = new DoubleHistogramBuckets(dq.globalMin, dq.globalMax, conf.bucketCount);
result.buckets = b;
result.decomposition = new NumericIntervalDecomposition(dq, b);
} else if (col.kind.isString()) {
StringColumnQuantization sq = (StringColumnQuantization)q;
assert sq != null;
StringHistogramBuckets b = new StringHistogramBuckets(sq.leftBoundaries);
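// Presumably equiSpaced sub-samples the quantization's left boundaries
// down to roughly conf.bucketCount equi-spaced bucket labels.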
List<String> labels = new ArrayList<String>();
Utilities.equiSpaced(Arrays.asList(sq.leftBoundaries), conf.bucketCount, labels);
StringHistogramBuckets b = new StringHistogramBuckets(labels.toArray(new String[0]));
result.buckets = b;
result.decomposition = new StringIntervalDecomposition(sq, b);
} else {
@@ -311,10 +315,32 @@ private void benchmarkHistogram(ExperimentConfig conf, ColumnDescription col) {
}
}

private void allHistograms(ColumnDescription col, ExperimentConfig conf) {
conf.useRawData = true;
conf.usePostProcessing = false;
this.benchmarkHistogram(conf, col);
conf.useRawData = false;
this.benchmarkHistogram(conf, col);
conf.usePostProcessing = true;
this.benchmarkHistogram(conf, col);
}

private void allHeatmaps(ColumnDescription col0, ColumnDescription col1, ExperimentConfig conf) {
conf.useRawData = true;
conf.usePostProcessing = false;
this.benchmarkHeatmap(conf, col0, col1);
conf.useRawData = false;
this.benchmarkHeatmap(conf, col0, col1);
conf.usePostProcessing = true;
this.benchmarkHeatmap(conf, col0, col1);
}

public void run(HashSet<String> datasets) {
assert this.ontimeSchema != null;
ExperimentConfig conf = new ExperimentConfig();
System.out.println("Measurement,Column(s),Type,Machines,Iteration,Time (ms)");
System.out.println("Type,Column(s),Measurements,Machines,Bucket ct,Iteration,Time (ms)");
List<ColumnDescription> cols = this.ontimeSchema.getColumnDescriptions();

for (Dataset d: Arrays.asList(Dataset.Cloud, Dataset.Local, Dataset.DB)) {
if (!datasets.contains(d.toString()))
continue;
@@ -325,30 +351,50 @@ public void run(HashSet<String> datasets) {
assert this.cloudFlights != null;
machines.addAll(this.cloudFlights.keySet());
} else {
// On local datasets this will always have 1 machine
machines.add(1);
}
for (int m: machines) {
conf.machines = m;
List<ColumnDescription> cols = this.ontimeSchema.getColumnDescriptions();
for (ColumnDescription col : cols) {
conf.useRawData = true;
conf.usePostProcessing = false;
this.benchmarkHistogram(conf, col);
conf.useRawData = false;
this.benchmarkHistogram(conf, col);
conf.usePostProcessing = true;
this.benchmarkHistogram(conf, col);
if (false) {
// Vary the columns
for (int m : machines) {
conf.machines = m;
for (ColumnDescription col : cols) {
this.allHistograms(col, conf);
}
for (int i = 0; i < cols.size() - 1; i++) {
ColumnDescription col0 = cols.get(i);
ColumnDescription col1 = cols.get(i + 1);
this.allHeatmaps(col0, col1, conf);
}
}
for (int i = 0; i < cols.size() - 1; i++) {
ColumnDescription col0 = cols.get(i);
ColumnDescription col1 = cols.get(i + 1);
conf.useRawData = true;
conf.usePostProcessing = false;
this.benchmarkHeatmap(conf, col0, col1);
conf.useRawData = false;
this.benchmarkHeatmap(conf, col0, col1);
conf.usePostProcessing = true;
this.benchmarkHeatmap(conf, col0, col1);
} else if (false) {
// Vary number of buckets for some columns
if (d.equals(Dataset.Local)) {
ColumnDescription col = this.ontimeSchema.getDescription("FlightDate");
ColumnDescription col1 = this.ontimeSchema.getDescription("OriginState");
for (int buckets = 1; buckets < 1025; buckets *= 2) {
conf.bucketCount = buckets;
this.allHistograms(col, conf);
}

for (int buckets = 1; buckets < 1025; buckets *= 2) {
conf.bucketCount = buckets;
this.allHeatmaps(col, col1, conf);
}
}
} else {
// vary quantization intervals
if (d.equals(Dataset.Local)) {
ColumnDescription col = this.ontimeSchema.getDescription("DepTime");
int[] granularity = { 1, 2, 5, 10, 20, 100 };
PrivacySchema ps = this.flightsWrapper.getPrivacySchema();
DoubleColumnQuantization q = (DoubleColumnQuantization)ps.quantization(col.name);

for (int i: granularity) {
conf.machines = i; // not really machines: reuse this field to record the quantization granularity in the output
ps.quantization.set(col.name, new DoubleColumnQuantization(i, q.globalMin, q.globalMax));
this.allHistograms(col, conf);
}
}
}
}
