Integration with geopandas #588 #818

Closed
wants to merge 24 commits into from

Commits (24)
d317c98
Merge remote-tracking branch 'altair-viz/master'
iliatimofeev Mar 27, 2018
e297f20
Merge remote-tracking branch 'altair-viz/master'
iliatimofeev Mar 27, 2018
ea65ca1
Merge remote-tracking branch 'altair-viz/master'
iliatimofeev May 4, 2018
f7d66b7
__geo_interface__ in to_geojson_values
iliatimofeev May 5, 2018
e9d28a5
GeoDataFrame support without dependency of GeoPandas
iliatimofeev May 5, 2018
985d3f6
Merge remote-tracking branch 'altair-viz/master' into it-#588-geopandas
iliatimofeev May 5, 2018
b627c6c
Unused test file
iliatimofeev May 6, 2018
2cd9fde
Full __geo_interface__ support, test_geojson
iliatimofeev May 7, 2018
7db9ff8
test update
iliatimofeev May 7, 2018
950eb72
Mistakenly added .vscode files removed
iliatimofeev May 8, 2018
9f91c00
limit_rows two returns, to_* one if "geo" statement, four spaces for…
iliatimofeev May 8, 2018
76a2af8
geojson_feature()
iliatimofeev May 9, 2018
094a2e7
test_geopandas_examples (hacker version)
iliatimofeev May 13, 2018
df84294
travis-ci: move finalized locals outside try
iliatimofeev May 13, 2018
27a3df8
remove python 3 code
iliatimofeev May 13, 2018
301ea23
flat version
iliatimofeev May 16, 2018
c114acc
flat version
iliatimofeev May 16, 2018
143ad04
Merge remote-tracking branch 'altair-viz/master' into it-#588-geopandas
iliatimofeev May 16, 2018
661447d
Merge remote-tracking branch 'altair-viz/master' into it-#588-geopandas
iliatimofeev May 16, 2018
d2b46e0
GeoPandas ref
iliatimofeev May 17, 2018
649fa21
Merge remote-tracking branch 'altair-viz/master' into it-#588-geopandas
iliatimofeev Jun 10, 2018
89a999e
Merge remote-tracking branch 'altair-viz/master' into it-#588-geopandas
iliatimofeev Jun 10, 2018
80c56d6
merge
iliatimofeev Jun 10, 2018
1bb8192
flake8 fix
iliatimofeev Jun 10, 2018
1 change: 1 addition & 0 deletions .gitignore
@@ -62,6 +62,7 @@ target/

.ipynb_checkpoints
.idea/*
.vscode/*
tools/_build
Untitled*.ipynb
.mypy*
29 changes: 28 additions & 1 deletion altair/utils/core.py
@@ -233,7 +233,7 @@ def parse_shorthand(shorthand, data=None, parse_aggregates=True,
if parse_types:
patterns = list(itertools.chain(*((p + ':{type}', p) for p in patterns)))

regexps = (re.compile('\A' + p.format(**units) + '\Z', re.DOTALL)
regexps = (re.compile(r'\A' + p.format(**units) + r'\Z', re.DOTALL)
for p in patterns)

# find matches depending on valid fields passed
@@ -375,3 +375,30 @@ def display_traceback(in_ipython=True):
ip.showtraceback(exc_info)
else:
traceback.print_exception(*exc_info)


def geopandas_to_dict(data):
try:
if ('geometry' != data.geometry.name) and ('geometry' in data.columns) :
warnings.warn("column name 'geometry' is reserved name for GeoDataFrame. "+
"Column named 'geometry' should contain actual displaying geometry or not be used. "+
"Data of column will not be accessible from the chart description. ")
if 'type' in data.columns :
warnings.warn("Column name 'type' is reserved name for GeoDataFrame. "+
"Data of column 'type' will not be accessible from the chart description.")
if 'id' in data.columns :
warnings.warn("Column name 'id' is reserved name for GeoDataFrame for index values. "+
"Data of column 'id' will not be accessible from the chart description.")
return [ dict(row,type = feature['type'],geometry = feature['geometry'], id = feature['id'])
for row,feature in zip(
data.drop(data.geometry.name, axis=1).to_dict('row'),
data.geometry.__geo_interface__['features']
)
]
except AttributeError as err:
if str(err).startswith('No geometry data set yet'):
warnings.warn("GeoDataFrame has no geometry to display.")
return data.to_dict('row')
else:
raise
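
For orientation, here is a rough sketch of how the helper above flattens a GeoDataFrame into row dictionaries. It assumes geopandas and shapely are installed and that `geopandas_to_dict` is importable as added in this diff; the column names and values are invented for illustration, and the output shown is indicative rather than exact.

```python
# Illustration only: assumes geopandas/shapely are available and that
# geopandas_to_dict exists in altair.utils.core as added in this PR.
import geopandas as gpd
from shapely.geometry import Point

from altair.utils.core import geopandas_to_dict

gdf = gpd.GeoDataFrame({'prop': [1, 2]},
                       geometry=[Point(0, 0), Point(1, 1)])

rows = geopandas_to_dict(gdf)
# Each row keeps the plain columns and gains 'type', 'geometry' and 'id'
# taken from the GeoSeries' __geo_interface__ feature collection, roughly:
# {'prop': 1, 'type': 'Feature', 'id': '0',
#  'geometry': {'type': 'Point', 'coordinates': (0.0, 0.0)}}
```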

73 changes: 60 additions & 13 deletions altair/utils/data.py
@@ -1,6 +1,7 @@
import json
import random
import hashlib
import random
import warnings

import pandas as pd
from toolz.curried import curry, pipe # noqa
@@ -33,7 +34,7 @@ class DataTransformerRegistry(PluginRegistry[DataTransformerType]):
# form.
#
# A data model transformer has the following type signature:
# DataModelType = Union[dict, pd.DataFrame]
# DataModelType = Union[dict, pd.DataFrame, gpd.GeoDataFrame, geojson.GeoJSON]
# DataModelTransformerType = Callable[[DataModelType, KwArgs], DataModelType]
# ==============================================================================

@@ -52,11 +53,10 @@ def limit_rows(data, max_rows=5000):
check_data_type(data)
if isinstance(data, pd.DataFrame):
values = data
elif isinstance(data, dict):
if 'values' in data:
values = data['values']
else:
return data
elif isinstance(data, dict) and ('values' in data):
values = data['values']
else:
return data
Collaborator:
As currently written the function will never progress beyond this line, and the max_rows check will never happen

@iliatimofeev (Contributor Author), May 17, 2018:
if so, how could it pass test_limit_rows()?

Contributor Author:
Or do you mean that it bypasses unknown data types? Then yes, like #887.
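
For reference, a condensed, self-contained sketch of the branching under discussion (paraphrasing the new code rather than quoting it; MaxRowsError is stood in by a local class for illustration):

```python
import pandas as pd

class MaxRowsError(Exception):
    """Stand-in for altair's MaxRowsError, for illustration only."""

def limit_rows_sketch(data, max_rows=5000):
    # DataFrames and {'values': [...]} dicts fall through to the length
    # check below; anything else (including objects that only expose
    # __geo_interface__) is returned untouched, so max_rows never applies.
    if isinstance(data, pd.DataFrame):
        values = data
    elif isinstance(data, dict) and 'values' in data:
        values = data['values']
    else:
        return data
    if max_rows is not None and len(values) > max_rows:
        raise MaxRowsError('The number of rows exceeds max_rows.')
    return data
```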

if max_rows is not None and len(values) > max_rows:
raise MaxRowsError('The number of rows in your dataset is greater '
'than the maximum allowed ({0}). '
@@ -78,7 +78,6 @@ def sample(data, n=None, frac=None):
values = random.sample(values, n)
return {'values': values}


@curry
def to_json(data, prefix='altair-data', extension='json',
filename="{prefix}-{hash}.{extension}"):
@@ -117,9 +116,23 @@ def to_csv(data, prefix='altair-data', extension='csv',
def to_values(data):
"""Replace a DataFrame by a data model with values."""
check_data_type(data)
if isinstance(data, pd.DataFrame):

if hasattr(data,'__geo_interface__'):
if isinstance(data, pd.DataFrame): #GeoPandas
data = sanitize_dataframe(data)
return {
'values': _geopandas_to_dict(data),
'format': {'type': 'json'}
}
else:
return {
'values':data.__geo_interface__,
'format': {'type': 'json'}
}
elif isinstance(data, pd.DataFrame):
data = sanitize_dataframe(data)
return {'values': data.to_dict(orient='records')}

elif isinstance(data, dict):
if 'values' not in data:
raise KeyError('values expected in data dict, but not present.')
@@ -128,8 +141,8 @@ def to_values(data):

def check_data_type(data):
"""Raise if the data is not a dict or DataFrame."""
if not isinstance(data, (dict, pd.DataFrame)):
raise TypeError('Expected dict or DataFrame, got: {}'.format(type(data)))
if not (isinstance(data, (dict, pd.DataFrame)) or hasattr(data,'__geo_interface__')):
raise TypeError('Expected dict, DataFrame, GeoDataFrame or geojson, got: {}'.format(type(data)))


# ==============================================================================
@@ -143,7 +156,14 @@ def _compute_data_hash(data_str):
def _data_to_json_string(data):
"""Return a JSON string representation of the input data"""
check_data_type(data)
if isinstance(data, pd.DataFrame):
if hasattr(data,'__geo_interface__'):
if isinstance(data, pd.DataFrame): #GeoPandas
data = sanitize_dataframe(data)
values = _geopandas_to_dict(data)
return json.dumps(values)
else:
return json.dumps(data.__geo_interface__)
elif isinstance(data, pd.DataFrame):
data = sanitize_dataframe(data)
return data.to_json(orient='records')
elif isinstance(data, dict):
@@ -158,7 +178,9 @@ def _data_to_json_string(data):
def _data_to_csv_string(data):
"""return a CSV string representation of the input data"""
check_data_type(data)
if isinstance(data, pd.DataFrame):
if hasattr(data,'__geo_interface__'):
raise NotImplementedError('use to_json or to_values with GeoJSON objects.')
elif isinstance(data, pd.DataFrame):
data = sanitize_dataframe(data)
return data.to_csv(index=False)
elif isinstance(data, dict):
@@ -168,3 +190,28 @@ def _data_to_csv_string(data):
else:
raise NotImplementedError("to_csv only works with data expressed as "
"a DataFrame or as a dict")

def _geopandas_to_dict(data):
try:
if ('geometry' != data.geometry.name) and ('geometry' in data.columns) :
warnings.warn("column name 'geometry' is reserved name for GeoDataFrame. "+
"Column named 'geometry' should contain actual displaying geometry or not be used. "+
"Data of column will not be accessible from the chart description. ")
if 'type' in data.columns :
warnings.warn("Column name 'type' is reserved name for GeoDataFrame. "+
"Data of column 'type' will not be accessible from the chart description.")
if 'id' in data.columns :
warnings.warn("Column name 'id' is reserved name for GeoDataFrame for index values. "+
"Data of column 'id' will not be accessible from the chart description.")
return [ dict(row,type = feature['type'],geometry = feature['geometry'], id = feature['id'])
for row,feature in zip(
data.drop(data.geometry.name, axis=1).to_dict('row'),
data.geometry.__geo_interface__['features']
)
]
except AttributeError as err:
if str(err).startswith('No geometry data set yet'):
warnings.warn("GeoDataFrame has no geometry to display.")
return data.to_dict('row')
else:
raise
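
As a quick illustration of the duck-typed branch added to to_values above: any object exposing __geo_interface__ is passed through as GeoJSON. The stand-in class below is invented for the example and mirrors the FakeGeoJSON used in the new tests; the result shown assumes the branch lands as written in this diff.

```python
from altair.utils.data import to_values

class FakeGeoJSON:
    # Minimal stand-in: to_values only checks for __geo_interface__.
    __geo_interface__ = {'type': 'FeatureCollection', 'features': []}

result = to_values(FakeGeoJSON())
# Expected shape (with the branch above in place):
# {'values': {'type': 'FeatureCollection', 'features': []},
#  'format': {'type': 'json'}}
```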
2 changes: 1 addition & 1 deletion altair/utils/tests/test_data.py
@@ -62,7 +62,7 @@ def test_to_values():


def test_type_error():
"""Ensure that TypeError is raised for types other than dict/DataFrame."""
"""Ensure that TypeError is raised for types other than dict/DataFrame/GeoDataFrame/__geo_interface__."""
for f in (sample, limit_rows, to_values):
with pytest.raises(TypeError):
pipe(0, f)
194 changes: 194 additions & 0 deletions altair/utils/tests/test_geojson.py
@@ -0,0 +1,194 @@
import pytest
import pandas as pd
import altair.vegalite.v2 as alt

from ..data import pipe, to_values, to_csv
from .. import parse_shorthand


def _create_geojson():
return {
"type": "FeatureCollection",
"bbox": [
-161.30174569731454,
-60.39157788643298,
172.67580002536624,
42.438347020953984
],
"features": [
{
"type": "Feature",
"properties": {"prop": 1},
"geometry": {
"type": "LineString",
"coordinates": [
[-69.2980008004234, 23.18780298146116],
[-161.30174569731454, -60.39157788643298],
[172.67580002536624, 24.151450472748962]
]
},
"id": "0",
"bbox": [
-161.30174569731454,
-60.39157788643298,
172.67580002536624,
24.151450472748962
]
},
{
"type": "Feature",
"properties": {"prop": 2},
"geometry": {
"type": "LineString",
"coordinates": [
[156.03047546751765, 42.438347020953984],
[35.46296546950265, -18.185542212943375],
[152.53211600051463, 23.471406463455793]
]
},
"id": "1",
"bbox": [
35.46296546950265,
-18.185542212943375,
156.03047546751765,
42.438347020953984
]
},
{
"type": "Feature",
"properties": {"prop": 3},
"geometry": {
"type": "LineString",
"coordinates": [
[-133.98414913936503, 25.39468871174894],
[145.04376601680605, 13.058626381790845],
[170.30576801294046, 38.67128737163435]
]
},
"id": "2",
"bbox": [
-133.98414913936503,
13.058626381790845,
170.30576801294046,
38.67128737163435
]
}
]
}

def _create_fake_geo_interface():
class FakeGeoJSON:
__geo_interface__=_create_geojson()
return FakeGeoJSON()

def _create_fake_geodataframe():
class FakeGeoSeries:
__geo_interface__=_create_geojson()
def __init__(self, geometry_name = 'geometry'):
self.name = geometry_name

class FakeGeoDataFrame(pd.DataFrame):
__geo_interface__ = _create_geojson()
geometry = FakeGeoSeries()
def copy(self, deep=True):
data = self._data
if deep:
data = data.copy()
return FakeGeoDataFrame(data).__finalize__(self)
def drop(self, labels=None, axis=0,**kwargs):
if (axis == 1) and (self.geometry.name == labels):
return self.copy()
return super(FakeGeoDataFrame,self).drop(labels, axis,**kwargs)

return FakeGeoDataFrame({'prop':[1,2,3]})

def test_to_values_geo():
"""Test the to_values data transformer."""

data = _create_fake_geodataframe()
result = pipe(data, to_values)
assert result['format'] == {'type':'json'}
assert result['values'][1]['geometry']==data.__geo_interface__['features'][1]['geometry']
assert result['values'][1]['type']==data.__geo_interface__['features'][1]['type']

data = _create_fake_geo_interface()
result = pipe(data, to_values)
assert result['format'] == {'type':'json'}
assert result['values']==data.__geo_interface__

def test_chart_data_geotypes():
Chart = lambda data,**arg: alt.Chart(data).mark_geoshape().project().encode(**arg)

# Fake GeoPandas
data = _create_fake_geodataframe()
dct = Chart(data,fill='prop').to_dict()
assert dct['data']['values'][1]['geometry']==data.__geo_interface__['features'][1]['geometry']
assert dct['data']['values'][1]['type']==data.__geo_interface__['features'][1]['type']

# Fake GeoInterface
data = _create_fake_geo_interface()
dct = Chart(data).to_dict()
assert dct['data']['format'] == {'type':'json'}
assert dct['data']['values'] == data.__geo_interface__

def test_parse_shorthand_with_geodata():
def check(s, data, **kwargs):
assert parse_shorthand(s, data) == kwargs

data = _create_fake_geodataframe()

check('prop', data, field='prop', type='quantitative')
check('prop:N', data, field='prop', type='nominal')
check('count(prop)', data, field='prop', aggregate='count', type='quantitative')

data = _create_fake_geo_interface()

check('properties.prop:Q', data, field='properties.prop', type='quantitative')
check('prop', data, field='prop')

def test_to_csv_geo():
"""Test the to_csv raise error with geopandas."""

data = _create_fake_geodataframe()
with pytest.raises(NotImplementedError):
pipe(data, to_csv)

def test_geo_pandas():
gpd = pytest.importorskip('geopandas')

data = gpd.GeoDataFrame.from_features(_create_geojson())
dct = alt.Chart(data).mark_geoshape().project().encode(fill='prop').to_dict()

assert dct['data']['format'] == {'type':'json'}
assert dct['encoding'] == {'fill': {'field': 'prop', 'type': 'quantitative'}}
data2 = gpd.GeoDataFrame.from_features({
'type':'FeatureCollection',
'features':[{'type':item['type'],
'geometry':item['geometry'],
'id':item['id'],
'properties':{ k: item[k]
for k in item.keys()
if k not in ('type','geometry')
}
} for item in dct['data']['values']]
})

assert (data2[data.columns] == data).all().all()
Contributor:
Would data2[data.columns].equals(data) not work here?

Contributor Author:
should work :) and looks more readable
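
For the record, a small sketch of the two comparisons being discussed, on toy frames rather than the test data:

```python
import pandas as pd

a = pd.DataFrame({'prop': [1, 2, 3]})
b = a.copy()

# What the test does now: element-wise compare, then reduce both axes.
assert (b[a.columns] == a).all().all()

# The suggested alternative: one call, and NaNs in matching positions
# compare as equal, which the == version does not treat as equal.
assert b[a.columns].equals(a)
```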


def test_geojson_feature():
Chart = lambda data,**arg: alt.Chart(alt.geojson_feature(data,'test_prop')
).mark_geoshape().project().encode(**arg)

# Fake GeoInterface
data = _create_fake_geo_interface()
dct = Chart(data).to_dict()

assert dct['data']['format'] == {'type':'json','property':'test_prop'}
assert dct['data']['values'] == data.__geo_interface__

# url
data = "url.json"
dct = Chart(data).to_dict()

assert dct['data']['format'] == {'type':'json','property':'test_prop'}
assert dct['data']['url'] == data