Skip to content
This repository has been archived by the owner on Aug 30, 2022. It is now read-only.

Commit

Permalink
Configuration; greenplum extraction via columns (turned off) (#693)
Browse files Browse the repository at this point in the history
* Column-oriented data extraction from greenplum (does not work)
* Configurable demo datasets menu
* Locate geographic metadata for Greenplum data
  • Loading branch information
Mihai Budiu authored Sep 18, 2020
1 parent 0b7e133 commit 5bee207
Show file tree
Hide file tree
Showing 31 changed files with 771 additions and 206 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ repository/*.jar

# data which is too big to put into git
data/ontime/On_Time_On_Time*
data/ontime_big/*.gz
data/ontime/2016_*.csv
data/ontime/*.orc
data/ontime_orc/*
Expand Down
13 changes: 8 additions & 5 deletions bin/deploy-greenplum.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from argparse import ArgumentParser
from jproperties import Properties
import os
import tempfile
from hillviewCommon import ClusterConfiguration, get_config, get_logger, execute_command

def main():
Expand All @@ -34,8 +35,8 @@ def main():
execute_command("./package-binaries.sh")
web = config.get_webserver()
web.copy_file_to_remote("../hillview-bin.zip", ".", "")
web.copy_file_to_remote("config-greenplum.json", ".", "")
web.run_remote_shell_command("unzip -o hillview-bin.zip")
web.copy_file_to_remote("config-greenplum.json", "bin", "")
web.run_remote_shell_command("cd bin; ./upload-data.py -d . -s dump-greenplum.sh config-greenplum.json")
web.run_remote_shell_command("cd bin; ./redeploy.sh -s config-greenplum.json")
web.copy_file_to_remote("../repository/PROGRESS_DATADIRECT_JDBC_DRIVER_PIVOTAL_GREENPLUM_5.1.4.000275.jar",
Expand All @@ -45,10 +46,12 @@ def main():
p = Properties()
p.load(f, "utf-8")
p["greenplumDumpScript"] = config.service_folder + "/dump-greenplum.sh"
with open("hillview.properties", "wb") as f:
p.store(f, encoding="utf-8")
web.copy_file_to_remote("hillview.properties", config.service_folder, "")
os.remove("hillview.properties")
p["hideDemoMenu"] = "true"
tmp = tempfile.NamedTemporaryFile(mode="w", delete=False)
p.store(tmp, encoding="utf-8")
tmp.close()
web.copy_file_to_remote(tmp.name, config.service_folder + "/hillview.properties", "")
os.remove(tmp.name)

if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion bin/dump-greenplum.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@
DIR=$1
PREFIX="file"
mkdir -p ${DIR} || exit 1
echo "$(</dev/stdin)" >${DIR}/${PREFIX}${GP_SEGMENT_ID}
#cat </dev/stdin >${DIR}/${PREFIX}${GP_SEGMENT_ID}
split -l 500000 -a 3 - ${DIR}/${PREFIX}${GP_SEGMENT_ID} </dev/stdin
2 changes: 2 additions & 0 deletions bin/greenplum.properties
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
###########################################################
# Parameters interfacing Hillview with a Greenplum database

hideSuggestions = true

# This script is invoked when data is dumped from an external web table
greenplumDumpScript = /home/gpdamin/hillview/dump-greenplum.sh
# This directory is used to store the data dumped from Greenplum before it's parsed by Hillview.
Expand Down
4 changes: 2 additions & 2 deletions bin/rebuild.sh
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ if [ "x${TOOLSARGS}" != "x" ]; then
fi
export MAVEN_OPTS="-Xmx2048M"
pushd ${mydir}/../platform
mvn ${TOOLSARGS} ${TESTARGS} install
mvn ${TOOLSARGS} ${TESTARGS} clean install
popd
pushd ${mydir}/../web
mvn ${TESTARGS} package
mvn ${TESTARGS} clean package
popd
1 change: 0 additions & 1 deletion data/ontime_big/On_Time_On_Time_Performance_2016_1.csv.gz

This file was deleted.

1 change: 0 additions & 1 deletion data/ontime_big/On_Time_On_Time_Performance_2016_2.csv.gz

This file was deleted.

18 changes: 18 additions & 0 deletions hillview.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
###########################################################
# Parameters influencing the display of the UI

enableSaveAs = true
localDbMenu = true
showTestMenu = true
enableManagement = true
privateIsCsv = true
hideSuggestions = true

###########################################################
# Parameters interfacing Hillview with a Greenplum database

# This script is invoked when data is dumped from an external web table
greenplumDumpScript = /home/gpdamin/hillview/dump-greenplum.sh
# This directory is used to store the data dumped from Greenplum before it's parsed by Hillview.
# The directory must be writable by the segment hosts.
greenplumDumpDirectory = /tmp
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,16 @@ public static class Status implements /* Serializable, implied by IJson */ IJson
/**
* Host where control message executed.
*/
final String hostname;
public final String hostname;
/**
* Some report about the execution status.
*/
final String result;
public final String result;
/**
* Exception caused if any.
*/
@Nullable
final
Throwable exception;
public final Throwable exception;

public Status(String result) {
this.hostname = Utilities.getHostName();
Expand All @@ -69,6 +68,10 @@ public Status(String result, Throwable ex) {
this.exception = ex;
}

public boolean isError() {
return this.exception != null;
}

@Override
public JsonElement toJsonTree() {
JsonObject result = new JsonObject();
Expand Down
2 changes: 1 addition & 1 deletion platform/src/main/java/org/hillview/main/DataUpload.java
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ private Params parseCommand(String[] args) throws Exception {
options.addOption(o_skip);

CommandLineParser parser = new DefaultParser();
CommandLine cmd = null;
CommandLine cmd;
try {
cmd = parser.parse(options, args);
if (cmd == null)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2020 VMware Inc. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.hillview.sketches;

import org.hillview.dataset.api.ControlMessage;
import org.hillview.dataset.api.TableSketch;
import org.hillview.storage.CsvFileLoader;
import org.hillview.table.LazySchema;
import org.hillview.table.Schema;
import org.hillview.table.api.IColumn;
import org.hillview.table.api.ITable;
import org.hillview.table.columns.LazyColumn;
import org.hillview.utils.Converters;
import org.hillview.utils.HillviewLogger;

import javax.annotation.Nullable;

/**
 * An unorthodox sketch: instead of only summarizing the table it is applied to,
 * it mutates that table by filling in the data of some more of its columns.
 * The column data is read from the CSV file the table was originally loaded from.
 */
public class LoadCsvColumnsSketch
        extends ControlMessage.StatusListMonoid
        implements TableSketch<ControlMessage.StatusList> {

    /** Describes the columns whose data should be loaded into the table. */
    private final Schema schema;

    public LoadCsvColumnsSketch(Schema schema) {
        this.schema = schema;
    }

    /**
     * Re-reads the columns described by the schema from the table's source file
     * and installs their data into the corresponding lazy columns of the table.
     *
     * @param data table to fill in; must not be null, and must have a source file.
     * @return a status list holding a single "OK" status.
     * @throws RuntimeException if a loaded column and the column it is meant
     *                          to fill have different numbers of rows.
     */
    @Nullable
    @Override
    public ControlMessage.StatusList create(@Nullable ITable data) {
        HillviewLogger.instance.info("Loading CSV columns for table",
                "Columns are {0}", this.schema.toString());
        Converters.checkNull(data);

        CsvFileLoader.Config csvConfig = new CsvFileLoader.Config();
        csvConfig.hasHeaderRow = true;
        // The data is expected in the same source file (data.getSourceFile())
        // that was used initially to load the table.
        ITable reloaded = new CsvFileLoader(
                Converters.checkNull(data.getSourceFile()),
                csvConfig,
                new LazySchema(this.schema)).load();
        Converters.checkNull(reloaded);

        for (String columnName : this.schema.getColumnNames()) {
            IColumn fresh = reloaded.getLoadedColumn(columnName);
            LazyColumn target = Converters.checkNull(
                    data.getColumn(columnName).as(LazyColumn.class));
            requireSameSize(data, fresh, target);
            target.setData(fresh);
        }
        return new ControlMessage.StatusList(new ControlMessage.Status("OK"));
    }

    /** Fails if the freshly loaded column and the column it will fill differ in row count. */
    private static void requireSameSize(ITable data, IColumn fresh, LazyColumn target) {
        if (target.sizeInRows() == fresh.sizeInRows()) {
            return;
        }
        throw new RuntimeException(
                "Loaded column has different size from original column: file="
                        + data.getSourceFile()
                        + " loaded=" + fresh.toString() + " size=" + fresh.sizeInRows()
                        + " original=" + target.toString() + " size=" + target.sizeInRows());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public class FileSetDescription implements IJson {
public String cookie = null;
/**
* Used for testing: allows reading the same data multiple times.
* A value of 0 is treated the same as 1.
*/
public int repeat = 1;
/**
Expand All @@ -84,6 +85,11 @@ public class FileSetDescription implements IJson {
* useful for temporary files.
*/
public boolean deleteAfterLoading;
/**
* Actual name of the dataset.
*/
@Nullable
public String name;

@SuppressWarnings("unused")
public String getBasename() {
Expand Down Expand Up @@ -118,24 +124,35 @@ class FileReference implements IFileReference {
public ITable load() {
TextFileLoader loader;
switch (FileSetDescription.this.fileKind) {
case "csv":
case "lazycsv":
/* {
For now treated as equivalent to CSV. This is used for the
csv files dumped by the greenplum database.
CsvFileLoader.Config config = new CsvFileLoader.Config();
config.allowFewerColumns = true;
config.hasHeaderRow = FileSetDescription.this.headerRow;
loader = new LazyCsvFileLoader(
this.pathname, config, FileSetDescription.this.getSchema());
break;
}
*/
case "csv": {
CsvFileLoader.Config config = new CsvFileLoader.Config();
config.allowFewerColumns = true;
config.hasHeaderRow = FileSetDescription.this.headerRow;
loader = new CsvFileLoader(
this.pathname, config, FileSetDescription.this.getSchema());
break;
}
case "orc":
loader = new OrcFileLoader(
this.pathname, FileSetDescription.this.getSchema(), true);
loader = new OrcFileLoader(this.pathname, FileSetDescription.this.getSchema(), true);
break;
case "parquet":
loader = new ParquetFileLoader(
this.pathname, true);
loader = new ParquetFileLoader(this.pathname, true);
break;
case "json":
loader = new JsonFileLoader(
this.pathname, FileSetDescription.this.getSchema());
loader = new JsonFileLoader(this.pathname, FileSetDescription.this.getSchema());
break;
case "hillviewlog":
loader = new HillviewLogs.LogFileLoader(this.pathname);
Expand Down
74 changes: 74 additions & 0 deletions platform/src/main/java/org/hillview/storage/LazyCsvFileLoader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2020 VMware Inc. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.hillview.storage;

import org.hillview.table.ColumnDescription;
import org.hillview.table.LazySchema;
import org.hillview.table.Schema;
import org.hillview.table.Table;
import org.hillview.table.api.IColumn;
import org.hillview.table.api.ITable;
import org.hillview.table.columns.LazyColumn;
import org.hillview.utils.Converters;
import org.hillview.utils.HillviewLogger;

import java.util.List;

/**
 * A special form of the CsvFileLoader which, when invoked, loads only the first
 * column of the schema. The table it produces still has one (lazy) column for every
 * column in the schema; the data of the remaining columns has to be supplied later
 * by an explicit request.
 */
public class LazyCsvFileLoader extends TextFileLoader {
    /** Complete schema of the file; schema guessing is not supported. */
    private final LazySchema schema;
    /** Loader restricted to the first column of the schema; does the actual reading. */
    CsvFileLoader loader;

    /**
     * @param path           file to load.
     * @param configuration  CSV configuration, also handed to the underlying first-column loader.
     * @param schema         schema of the file; must be known and have at least one column.
     * @throws RuntimeException if the schema is not known or has no columns.
     */
    public LazyCsvFileLoader(String path, CsvFileLoader.Config configuration, LazySchema schema) {
        super(path);
        this.schema = schema;
        this.allowFewerColumns = configuration.allowFewerColumns;
        if (this.schema.isNull())
            throw new RuntimeException("Schema guessing not supported for lazy csv loading");
        List<ColumnDescription> columns =
                Converters.checkNull(this.schema.getSchema()).getColumnDescriptions();
        // Fail with a descriptive message instead of an IndexOutOfBoundsException from subList.
        if (columns.isEmpty())
            throw new RuntimeException("Lazy csv loading requires a schema with at least one column");
        Schema firstColumn = new Schema(columns.subList(0, 1));
        this.loader = new CsvFileLoader(path, configuration, new LazySchema(firstColumn));
    }

    @Override
    public void prepareLoading() {
        this.loader.prepareLoading();
    }

    /**
     * Loads a fragment of the first column and wraps it in a lazy table that has
     * all the columns of the schema.
     *
     * @param maxRows  forwarded unchanged to the underlying CsvFileLoader.
     * @param skip     forwarded unchanged to the underlying CsvFileLoader.
     * @return a lazy table with as many rows as the loaded fragment, in which only
     *         the first column has data. The table is created with a NoLoader, so
     *         it cannot load the other columns by itself.
     */
    public ITable loadFragment(int maxRows, boolean skip) {
        ITable firstColumnTable = this.loader.loadFragment(maxRows, skip);
        int rowCount = firstColumnTable.getNumOfRows();
        // Fetch and null-check the schema once, and use it for everything below.
        Schema fullSchema = Converters.checkNull(this.schema.getSchema());
        List<ColumnDescription> desc = fullSchema.getColumnDescriptions();
        Table result = Table.createLazyTable(desc, rowCount, this.filename, new NoLoader());
        String firstColName = fullSchema.getColumnNames().get(0);
        IColumn col0 = firstColumnTable.getLoadedColumn(firstColName);
        LazyColumn fc = result.getColumn(firstColName).as(LazyColumn.class);
        Converters.checkNull(fc).setData(col0);
        return result;
    }

    @Override
    public void endLoading() {
        this.loader.endLoading();
    }
}
30 changes: 30 additions & 0 deletions platform/src/main/java/org/hillview/storage/NoLoader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
* Copyright (c) 2020 VMware Inc. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.hillview.storage;

import org.hillview.table.api.IColumn;
import org.hillview.table.api.IColumnLoader;

import java.util.List;

/**
 * A column loader that cannot load anything: every request to load columns fails.
 * LazyCsvFileLoader attaches it to the lazy tables it creates.
 */
public class NoLoader implements IColumnLoader {
    /**
     * Always fails.
     *
     * @param names names of the columns requested; ignored.
     * @throws UnsupportedOperationException always (a RuntimeException, so callers
     *         catching RuntimeException are unaffected).
     */
    @Override
    public List<? extends IColumn> loadColumns(List<String> names) {
        throw new UnsupportedOperationException("Cannot load columns");
    }
}
Loading

0 comments on commit 5bee207

Please sign in to comment.