diff --git a/data/ontime_private/gen_metadata.py b/data/ontime_private/gen_metadata.py index bcbe5fc11..f1ae5b8fc 100755 --- a/data/ontime_private/gen_metadata.py +++ b/data/ontime_private/gen_metadata.py @@ -27,6 +27,8 @@ def get_metadata(cn): (g, gMin, gMax) = (10, 0, 5100) elif cn == "FlightDate": # cluster values: (86400000, 852076800000, 1561852800000) + # 2017 values: (86400000, 1483286400000, 1514736000000) + # 2016 values, 2 months: (86400000, 1451635200000, 1456732800000) (g, gMin, gMax) = (86400000, 1451635200000, 1456732800000) else: raise Exception("Unexpected column " + cn) diff --git a/platform/src/main/java/org/hillview/utils/Utilities.java b/platform/src/main/java/org/hillview/utils/Utilities.java index a3b98b220..5fe988fc0 100644 --- a/platform/src/main/java/org/hillview/utils/Utilities.java +++ b/platform/src/main/java/org/hillview/utils/Utilities.java @@ -422,17 +422,6 @@ public static void intPairToByteArray(Pair p, /*out*/byte[] ar intToByteArray(p.second, arr, startIndex + INT_SIZE); } - public static void intPairPairToByteArray(Pair p1, Pair p2, /*out*/ byte[] arr, int startIndex) { - if (arr.length < 2) { - throw new RuntimeException("Not enough bytes allocated for output"); - } - - Arrays.fill(arr, (byte)0); - - intPairToByteArray(p1, arr, startIndex); - intPairToByteArray(p2, arr, startIndex + 2*INT_SIZE); - } - public static long byteArrayToLong(byte[] bytes) { if (bytes.length < 8) { throw new RuntimeException("Not enough bytes to convert to int"); diff --git a/privacy.md b/privacy.md index 021470816..05b79739d 100644 --- a/privacy.md +++ b/privacy.md @@ -47,6 +47,19 @@ dataset when using differential privacy. ## Privacy policy +### Privacy policy location + +The root node will treat a dataset as private if it can locate a +privacy policy file attached to it. All privacy policy files are +named `privacy_policy.json`. The policies are stored as regular files +on the filesystem of the root node. 
For a dataset comprised of files +(CSV, JSON, ORC, etc.), the privacy policy is located in a directory +that matches the directory where all the files reside. For a table +`table` residing in a JDBC database `database` the policy resides in a +directory named `database`/`table`. + +### File format + The following shows an excerpt from a privacy policy for a table with 4 columns: `OriginState` (String), `Origin` (String), `DepTime` (Numeric), and `DepDelay` (Numeric). @@ -57,7 +70,7 @@ can be leaked when data in the corresponding set of columns is visualized. In the following example the epsilon for the column `Origin` is set to 2, epsilon for zero columns (used when displaying the total number of rows) is set to 1, epsilon for any other 1 column is -set to 1.5, epsilon for any pair of columns is set to 0.1. (`defaultEpsilons` +set to 1.5, epsilon for any pair of columns is set to 0.1. (`defaultEpsilons` maps a column count to a privacy value.) The privacy policy also specifies for each column a *quantization @@ -139,12 +152,12 @@ probably exceed the true count almost always. For private data currently the only charting options are histograms/pie charts, heatmaps and Trellis plots of histograms. This -functionality can be extended in the future to encompass +functionality can be extended in the future to encompass 2D histograms and the other Trellis plots. #### Histograms -Histogram plots with private visualizations differ in the following ways from +Histogram plots with private visualizations differ in the following ways from non-private histograms: * Counts are displayed as ranges. @@ -159,15 +172,15 @@ of the number of entries that start with each letter of the alphabet. 
#### Pie charts -For pie charts the count and percentage of each slice is shown as a range (however, if -all displayed digits of a range are the same -- e.g., 9.3K, +For pie charts the count and percentage of each slice are shown as a range (however, if +all displayed digits of a range are the same -- e.g., 9.3K, the range may be displayed as a single value). #### Heatmaps For heatmaps the chart will only display the data that has high enough confidence (count > threshold * confidence_interval). The view menu -offers the ability to change the threshold from it's default value of 2. +offers the ability to change the threshold from its default value of 2. When mousing-over a cell in a heatmap the value will be displayed as a range; simultaneously, the range is shown on the legend above. diff --git a/web/src/main/java/org/hillview/benchmarks/DPAccuracyBenchmarks.java b/web/src/main/java/org/hillview/benchmarks/DPAccuracyBenchmarks.java index cb6dbac63..7adc8004f 100644 --- a/web/src/main/java/org/hillview/benchmarks/DPAccuracyBenchmarks.java +++ b/web/src/main/java/org/hillview/benchmarks/DPAccuracyBenchmarks.java @@ -53,7 +53,6 @@ import javax.annotation.Nullable; import java.io.FileWriter; import java.io.IOException; -import java.sql.SQLException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -204,9 +203,7 @@ private Pair computeSingleColumnAccuracy(String col, int colInde Histogram hist = table.blockingSketch(sk); // Leaf counts. Assert.assertNotNull(hist); - int totalLeaves = dd.getQuantizationIntervalCount(); TestKeyLoader tkl = new TestKeyLoader(); - ArrayList accuracies = new ArrayList<>(); double totAccuracy = 0.0; for (int i = 0 ; i < iterations; i++) { @@ -246,10 +243,7 @@ private Pair computeHeatmapAccuracy(String col1, ColumnQuantizat Heatmap heatmap = table.blockingSketch(sk); // Leaf counts. 
Assert.assertNotNull(heatmap); - int totalXLeaves = d0.getQuantizationIntervalCount(); - int totalYLeaves = d1.getQuantizationIntervalCount(); TestKeyLoader tkl = new TestKeyLoader(); - ArrayList accuracies = new ArrayList<>(); double totAccuracy = 0.0; for (int i = 0 ; i < iterations; i++) { @@ -356,7 +350,7 @@ public void benchmarkHeatmapL2Accuracy() throws IOException { writer.close(); } - public static void main(String[] args) throws IOException, SQLException { + public static void main(String[] args) throws IOException { if (args.length < 1) { return; } diff --git a/web/src/main/java/org/hillview/benchmarks/DPPerfBenchmarks.java b/web/src/main/java/org/hillview/benchmarks/DPPerfBenchmarks.java index be4334eb7..d1548ca11 100644 --- a/web/src/main/java/org/hillview/benchmarks/DPPerfBenchmarks.java +++ b/web/src/main/java/org/hillview/benchmarks/DPPerfBenchmarks.java @@ -45,6 +45,7 @@ import org.hillview.utils.Converters; import org.hillview.utils.HillviewLogger; import org.hillview.utils.HostList; +import org.hillview.utils.Utilities; import javax.annotation.Nullable; import java.io.IOException; @@ -193,6 +194,7 @@ static class ExperimentConfig { boolean useRawData; boolean usePostProcessing; Dataset dataset; + int bucketCount = DPPerfBenchmarks.buckets; int machines = 1; public String toString() { @@ -200,7 +202,7 @@ public String toString() { result += this.dataset.toString(); result += this.useRawData ? " public" : " quantized"; result += this.usePostProcessing ? 
" noised" : ""; - result += "," + this.machines; + result += "," + this.machines + "," + this.bucketCount; return result; } } @@ -224,13 +226,15 @@ private ColumnConfig getColumnConfig(ColumnDescription col, ExperimentConfig con if (col.kind.isNumeric() || col.kind == ContentsKind.Date) { DoubleColumnQuantization dq = (DoubleColumnQuantization)q; assert dq != null; - DoubleHistogramBuckets b = new DoubleHistogramBuckets(dq.globalMin, dq.globalMax, buckets); + DoubleHistogramBuckets b = new DoubleHistogramBuckets(dq.globalMin, dq.globalMax, conf.bucketCount); result.buckets = b; result.decomposition = new NumericIntervalDecomposition(dq, b); } else if (col.kind.isString()) { StringColumnQuantization sq = (StringColumnQuantization)q; assert sq != null; - StringHistogramBuckets b = new StringHistogramBuckets(sq.leftBoundaries); + List labels = new ArrayList(); + Utilities.equiSpaced(Arrays.asList(sq.leftBoundaries), conf.bucketCount, labels); + StringHistogramBuckets b = new StringHistogramBuckets(labels.toArray(new String[0])); result.buckets = b; result.decomposition = new StringIntervalDecomposition(sq, b); } else { @@ -311,10 +315,32 @@ private void benchmarkHistogram(ExperimentConfig conf, ColumnDescription col) { } } + private void allHistograms(ColumnDescription col, ExperimentConfig conf) { + conf.useRawData = true; + conf.usePostProcessing = false; + this.benchmarkHistogram(conf, col); + conf.useRawData = false; + this.benchmarkHistogram(conf, col); + conf.usePostProcessing = true; + this.benchmarkHistogram(conf, col); + } + + private void allHeatmaps(ColumnDescription col0, ColumnDescription col1, ExperimentConfig conf) { + conf.useRawData = true; + conf.usePostProcessing = false; + this.benchmarkHeatmap(conf, col0, col1); + conf.useRawData = false; + this.benchmarkHeatmap(conf, col0, col1); + conf.usePostProcessing = true; + this.benchmarkHeatmap(conf, col0, col1); + } + public void run(HashSet datasets) { assert this.ontimeSchema != null; ExperimentConfig 
conf = new ExperimentConfig(); - System.out.println("Measurement,Column(s),Type,Machines,Iteration,Time (ms)"); + System.out.println("Type,Column(s),Measurements,Machines,Bucket ct,Iteration,Time (ms)"); + List cols = this.ontimeSchema.getColumnDescriptions(); + for (Dataset d: Arrays.asList(Dataset.Cloud, Dataset.Local, Dataset.DB)) { if (!datasets.contains(d.toString())) continue; @@ -325,30 +351,50 @@ public void run(HashSet datasets) { assert this.cloudFlights != null; machines.addAll(this.cloudFlights.keySet()); } else { + // On local datasets this will always have 1 machine machines.add(1); } - for (int m: machines) { - conf.machines = m; - List cols = this.ontimeSchema.getColumnDescriptions(); - for (ColumnDescription col : cols) { - conf.useRawData = true; - conf.usePostProcessing = false; - this.benchmarkHistogram(conf, col); - conf.useRawData = false; - this.benchmarkHistogram(conf, col); - conf.usePostProcessing = true; - this.benchmarkHistogram(conf, col); + if (false) { + // Vary the columns + for (int m : machines) { + conf.machines = m; + for (ColumnDescription col : cols) { + this.allHistograms(col, conf); + } + for (int i = 0; i < cols.size() - 1; i++) { + ColumnDescription col0 = cols.get(i); + ColumnDescription col1 = cols.get(i + 1); + this.allHeatmaps(col0, col1, conf); + } } - for (int i = 0; i < cols.size() - 1; i++) { - ColumnDescription col0 = cols.get(i); - ColumnDescription col1 = cols.get(i + 1); - conf.useRawData = true; - conf.usePostProcessing = false; - this.benchmarkHeatmap(conf, col0, col1); - conf.useRawData = false; - this.benchmarkHeatmap(conf, col0, col1); - conf.usePostProcessing = true; - this.benchmarkHeatmap(conf, col0, col1); + } else if (false ){ + // Vary number of buckets for some columns + if (d.equals(Dataset.Local)) { + ColumnDescription col = this.ontimeSchema.getDescription("FlightDate"); + ColumnDescription col1 = this.ontimeSchema.getDescription("OriginState"); + for (int buckets = 1; buckets < 1025; buckets *= 
2) { + conf.bucketCount = buckets; + this.allHistograms(col, conf); + } + + for (int buckets = 1; buckets < 1025; buckets *= 2) { + conf.bucketCount = buckets; + this.allHeatmaps(col, col1, conf); + } + } + } else { + // vary quantization intervals + if (d.equals(Dataset.Local)) { + ColumnDescription col = this.ontimeSchema.getDescription("DepTime"); + int[] granularity = { 1, 2, 5, 10, 20, 100 }; + PrivacySchema ps = this.flightsWrapper.getPrivacySchema(); + DoubleColumnQuantization q = (DoubleColumnQuantization)ps.quantization(col.name); + + for (int i: granularity) { + conf.machines = i; // that's a lie + ps.quantization.set(col.name, new DoubleColumnQuantization(i, q.globalMin, q.globalMax)); + this.allHistograms(col, conf); + } } } }