Integration with geopandas #588 (#818)
**`.gitignore`**

```diff
@@ -62,6 +62,7 @@ target/
 .ipynb_checkpoints
 .idea/*
 .vscode/*
 tools/_build
 Untitled*.ipynb
 .mypy*
```
**`altair/utils/data.py`**
```diff
@@ -1,6 +1,7 @@
 import json
 import random
 import uuid
+import warnings
 
 import pandas as pd
 from toolz.curried import curry, pipe  # noqa
```
```diff
@@ -33,7 +34,7 @@ class DataTransformerRegistry(PluginRegistry[DataTransformerType]):
 # form.
 #
 # A data model transformer has the following type signature:
-# DataModelType = Union[dict, pd.DataFrame]
+# DataModelType = Union[dict, pd.DataFrame, gpd.GeoDataFrame, geojson.GeoJSON]
 # DataModelTransformerType = Callable[[DataModelType, KwArgs], DataModelType]
 # ==============================================================================
```
```diff
@@ -52,11 +53,10 @@ def limit_rows(data, max_rows=5000):
     check_data_type(data)
     if isinstance(data, pd.DataFrame):
         values = data
-    elif isinstance(data, dict):
-        if 'values' in data:
-            values = data['values']
-        else:
-            return data
+    elif isinstance(data, dict) and ('values' in data):
+        values = data['values']
+    else:
+        return data
 
     if max_rows is not None and len(values) > max_rows:
         raise MaxRowsError('The number of rows in your dataset is greater '
                            'than the maximum allowed ({0}). '
```

**Review comment** (on the new `else: return data` fall-through): As currently written the function will never progress beyond this line, and the max_rows check will never happen.

**Reply:** If so, how could it pass?

**Reply:** Or do you mean that it bypasses unknown data types? Then yes, like #887.
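To make the reviewer's point concrete: any input that is neither a DataFrame nor a values-dict, including geo-interface objects, takes the `return data` fall-through, so `max_rows` is never enforced for it. A minimal sketch of one way to count GeoJSON features as rows (the name `limit_rows_geo` is hypothetical; it reuses the module's `check_data_type` and `MaxRowsError`):

```python
def limit_rows_geo(data, max_rows=5000):
    """Hypothetical variant of limit_rows that also counts GeoJSON features."""
    check_data_type(data)
    if isinstance(data, pd.DataFrame):
        values = data
    elif isinstance(data, dict) and ('values' in data):
        values = data['values']
    elif hasattr(data, '__geo_interface__'):
        # A FeatureCollection keeps its rows under 'features'; other
        # GeoJSON objects count as a single row here.
        values = data.__geo_interface__.get('features', [data])
    else:
        return data
    if max_rows is not None and len(values) > max_rows:
        raise MaxRowsError('The number of rows in your dataset is greater '
                           'than the maximum allowed ({0}).'.format(max_rows))
    return data
```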
```diff
@@ -78,25 +78,63 @@ def sample(data, n=None, frac=None):
             values = random.sample(values, n)
             return {'values': values}
 
 
+def _geopandas_to_dict(data):
+    try:
+        if ('geometry' != data.geometry.name) and ('geometry' in data.columns):
+            warnings.warn("Column name 'geometry' is a reserved name for GeoDataFrame. "
+                          "A column named 'geometry' should contain the geometry to display, "
+                          "or not be used; its data will not be accessible from the chart description.")
+        if 'type' in data.columns:
+            warnings.warn("Column name 'type' is a reserved name for GeoDataFrame. "
+                          "Data of column 'type' will not be accessible from the chart description.")
+        if 'id' in data.columns:
+            warnings.warn("Column name 'id' is a reserved name for GeoDataFrame index values. "
+                          "Data of column 'id' will not be accessible from the chart description.")
+        return [dict(row, type=feature['type'], geometry=feature['geometry'], id=feature['id'])
+                for row, feature in zip(
+                    data.drop(data.geometry.name, axis=1).to_dict('row'),
+                    data.geometry.__geo_interface__['features'])]
+
+    except AttributeError as err:
+        if str(err).startswith('No geometry data set yet'):
+            warnings.warn("GeoDataFrame has no geometry to display.")
+            return data.to_dict('row')
+        else:
+            raise
```
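For a sense of the output shape: a sketch of what `_geopandas_to_dict` yields for a tiny GeoDataFrame (requires geopandas and shapely; the `prop` column is just an example):

```python
import geopandas as gpd
from shapely.geometry import Point

gdf = gpd.GeoDataFrame({'prop': [1, 2]},
                       geometry=[Point(0, 0), Point(1, 1)])
records = _geopandas_to_dict(gdf)
# Each record merges the non-geometry columns with the feature's
# 'type', 'geometry' and 'id' taken from __geo_interface__, roughly:
# [{'prop': 1, 'type': 'Feature', 'id': '0',
#   'geometry': {'type': 'Point', 'coordinates': (0.0, 0.0)}},
#  {'prop': 2, 'type': 'Feature', 'id': '1',
#   'geometry': {'type': 'Point', 'coordinates': (1.0, 1.0)}}]
# (exact id representation and key order vary with the geopandas version)
```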
```diff
 @curry
 def to_json(data, prefix='altair-data'):
     """Write the data model to a .json file and return a url based data model."""
     check_data_type(data)
     ext = '.json'
     filename = _compute_filename(prefix=prefix, ext=ext)
-    if isinstance(data, pd.DataFrame):
+    data_format = {'type': 'json'}
+
+    if hasattr(data, '__geo_interface__'):
+        if isinstance(data, pd.DataFrame):  # GeoPandas
+            data = sanitize_dataframe(data)
+            values = _geopandas_to_dict(data)
+            with open(filename, 'w') as f:
+                json.dump(values, f)
+        else:
+            with open(filename, 'w') as f:
+                json.dump(data.__geo_interface__, f)
+
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         data.to_json(filename, orient='records')
+
     elif isinstance(data, dict):
         if 'values' not in data:
             raise KeyError('values expected in data dict, but not present.')
         values = data['values']
-        with open(filename) as f:
+        with open(filename, 'w') as f:
             json.dump(values, f)
     return {
         'url': filename,
-        'format': {'type': 'json'}
+        'format': data_format
     }
```
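A quick usage sketch for the new geo branch of `to_json` (the `FakeGeoJSON` stand-in is hypothetical; any object exposing `__geo_interface__` takes the same path):

```python
class FakeGeoJSON:
    __geo_interface__ = {'type': 'FeatureCollection', 'features': []}

spec = to_json(FakeGeoJSON())
# The GeoJSON payload is dumped to a uuid-named file on disk, and the
# returned data model references it by URL:
# {'url': 'altair-data-<uuid>.json', 'format': {'type': 'json'}}
```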
```diff
@@ -106,7 +144,10 @@ def to_csv(data, prefix='altair-data'):
     check_data_type(data)
     ext = '.csv'
     filename = _compute_filename(prefix=prefix, ext=ext)
-    if isinstance(data, pd.DataFrame):
+    if hasattr(data, '__geo_interface__'):
+        raise NotImplementedError('use to_json or to_values with GeoJSON objects.')
+
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         data.to_csv(filename)
     return {
```
```diff
@@ -121,9 +162,22 @@ def to_csv(data, prefix='altair-data'):
 def to_values(data):
     """Replace a DataFrame by a data model with values."""
     check_data_type(data)
-    if isinstance(data, pd.DataFrame):
+
+    if hasattr(data, '__geo_interface__'):
+        if isinstance(data, pd.DataFrame):  # GeoPandas
+            data = sanitize_dataframe(data)
+            return {'values': _geopandas_to_dict(data),
+                    'format': {'type': 'json'}}
+        else:
+            return {
+                'values': data.__geo_interface__,
+                'format': {'type': 'json'},
+            }
+
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         return {'values': data.to_dict(orient='records')}
+
     elif isinstance(data, dict):
         if 'values' not in data:
             raise KeyError('values expected in data dict, but not present.')
```
```diff
@@ -132,8 +186,8 @@ def to_values(data):
 
 def check_data_type(data):
     """Raise if the data is not a dict or DataFrame."""
-    if not isinstance(data, (dict, pd.DataFrame)):
-        raise TypeError('Expected dict or DataFrame, got: {}'.format(type(data)))
+    if not (isinstance(data, (dict, pd.DataFrame)) or hasattr(data, '__geo_interface__')):
+        raise TypeError('Expected dict, DataFrame, GeoDataFrame or geojson, got: {}'.format(type(data)))
 
 
 # ==============================================================================
```
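Because `check_data_type` now duck-types on `__geo_interface__`, anything implementing that protocol (GeoDataFrame, GeoSeries, shapely geometries, custom classes) is accepted. A minimal sketch:

```python
class MyFeature:
    __geo_interface__ = {
        'type': 'Feature',
        'geometry': {'type': 'Point', 'coordinates': (0.0, 0.0)},
        'properties': {},
    }

check_data_type(MyFeature())  # passes: the object exposes __geo_interface__
check_data_type([1, 2, 3])    # raises TypeError: not a dict/DataFrame/geo object
```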
**New test file** (under `altair/utils/tests/`, inferred from the relative imports)

```python
import pytest
import pandas as pd
import altair.vegalite.v2 as alt

from ..data import pipe, to_values, to_csv
from .. import parse_shorthand


def _create_geojson():
    return {
        "type": "FeatureCollection",
        "bbox": [
            -161.30174569731454,
            -60.39157788643298,
            172.67580002536624,
            42.438347020953984
        ],
        "features": [
            {
                "type": "Feature",
                "properties": {"prop": 1},
                "geometry": {
                    "type": "LineString",
                    "coordinates": [
                        [-69.2980008004234, 23.18780298146116],
                        [-161.30174569731454, -60.39157788643298],
                        [172.67580002536624, 24.151450472748962]
                    ]
                },
                "id": "0",
                "bbox": [
                    -161.30174569731454,
                    -60.39157788643298,
                    172.67580002536624,
                    24.151450472748962
                ]
            },
            {
                "type": "Feature",
                "properties": {"prop": 2},
                "geometry": {
                    "type": "LineString",
                    "coordinates": [
                        [156.03047546751765, 42.438347020953984],
                        [35.46296546950265, -18.185542212943375],
                        [152.53211600051463, 23.471406463455793]
                    ]
                },
                "id": "1",
                "bbox": [
                    35.46296546950265,
                    -18.185542212943375,
                    156.03047546751765,
                    42.438347020953984
                ]
            },
            {
                "type": "Feature",
                "properties": {"prop": 3},
                "geometry": {
                    "type": "LineString",
                    "coordinates": [
                        [-133.98414913936503, 25.39468871174894],
                        [145.04376601680605, 13.058626381790845],
                        [170.30576801294046, 38.67128737163435]
                    ]
                },
                "id": "2",
                "bbox": [
                    -133.98414913936503,
                    13.058626381790845,
                    170.30576801294046,
                    38.67128737163435
                ]
            }
        ]
    }


def _create_fake_geo_interface():
    class FakeGeoJSON:
        __geo_interface__ = _create_geojson()
    return FakeGeoJSON()


def _create_fake_geodataframe():
    class FakeGeoSeries:
        __geo_interface__ = _create_geojson()

        def __init__(self, geometry_name='geometry'):
            self.name = geometry_name

    class FakeGeoDataFrame(pd.DataFrame):
        __geo_interface__ = _create_geojson()
        geometry = FakeGeoSeries()

        # copy/drop are overridden so that operations performed by the
        # data transformers hand back this fake class (and thus its
        # __geo_interface__) instead of a plain DataFrame.
        def copy(self, deep=True):
            data = self._data
            if deep:
                data = data.copy()
            return FakeGeoDataFrame(data).__finalize__(self)

        def drop(self, labels=None, axis=0, **kwargs):
            if (axis == 1) and (self.geometry.name == labels):
                return self.copy()
            return super(FakeGeoDataFrame, self).drop(labels, axis, **kwargs)

    return FakeGeoDataFrame({'prop': [1, 2, 3]})


def test_to_values_geo():
    """Test the to_values data transformer."""
    data = _create_fake_geodataframe()
    result = pipe(data, to_values)
    assert result['format'] == {'type': 'json'}
    assert result['values'][1]['geometry'] == data.__geo_interface__['features'][1]['geometry']
    assert result['values'][1]['type'] == data.__geo_interface__['features'][1]['type']

    data = _create_fake_geo_interface()
    result = pipe(data, to_values)
    assert result['format'] == {'type': 'json'}
    assert result['values'] == data.__geo_interface__


def test_chart_data_geotypes():
    Chart = lambda data, **arg: alt.Chart(data).mark_geoshape().project().encode(**arg)

    # Fake GeoPandas
    data = _create_fake_geodataframe()
    dct = Chart(data, fill='prop').to_dict()
    assert dct['data']['values'][1]['geometry'] == data.__geo_interface__['features'][1]['geometry']
    assert dct['data']['values'][1]['type'] == data.__geo_interface__['features'][1]['type']

    # Fake GeoInterface
    data = _create_fake_geo_interface()
    dct = Chart(data).to_dict()
    assert dct['data']['format'] == {'type': 'json'}
    assert dct['data']['values'] == data.__geo_interface__


def test_parse_shorthand_with_geodata():
    def check(s, data, **kwargs):
        assert parse_shorthand(s, data) == kwargs

    data = _create_fake_geodataframe()

    check('prop', data, field='prop', type='quantitative')
    check('prop:N', data, field='prop', type='nominal')
    check('count(prop)', data, field='prop', aggregate='count', type='quantitative')

    data = _create_fake_geo_interface()

    check('properties.prop:Q', data, field='properties.prop', type='quantitative')
    check('prop', data, field='prop')


def test_to_csv_geo():
    """Test that to_csv raises an error with geopandas."""
    data = _create_fake_geodataframe()
    with pytest.raises(NotImplementedError):
        pipe(data, to_csv)


def test_geo_pandas():
    gpd = pytest.importorskip('geopandas')

    data = gpd.GeoDataFrame.from_features(_create_geojson())
    dct = alt.Chart(data).mark_geoshape().project().encode(fill='prop').to_dict()

    assert dct['data']['format'] == {'type': 'json'}
    assert dct['encoding'] == {'fill': {'field': 'prop', 'type': 'quantitative'}}

    data2 = gpd.GeoDataFrame.from_features({
        'type': 'FeatureCollection',
        'features': [{'type': item['type'],
                      'geometry': item['geometry'],
                      'id': item['id'],
                      'properties': {k: item[k]
                                     for k in item.keys()
                                     if k not in ('type', 'geometry')}
                      } for item in dct['data']['values']]
    })

    assert (data2[data.columns] == data).all().all()
```
**Review comment:** Would …

**Reply:** should work :) and looks more readable
```python
def test_geojson_feature():
    Chart = lambda data, **arg: alt.Chart(alt.geojson_feature(data, 'test_prop')
                                          ).mark_geoshape().project().encode(**arg)

    # Fake GeoInterface
    data = _create_fake_geo_interface()
    dct = Chart(data).to_dict()

    assert dct['data']['format'] == {'type': 'json', 'property': 'test_prop'}
    assert dct['data']['values'] == data.__geo_interface__

    # url
    data = "url.json"
    dct = Chart(data).to_dict()

    assert dct['data']['format'] == {'type': 'json', 'property': 'test_prop'}
    assert dct['data']['url'] == data
```
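The `alt.geojson_feature` helper exercised above is introduced by this PR, and its implementation is not part of this diff; the assertions imply behavior roughly like the following sketch (inferred, not the PR's actual code):

```python
def geojson_feature(data, feature):
    # Strings are treated as URLs to remote GeoJSON.
    if isinstance(data, str):
        return {'url': data,
                'format': {'type': 'json', 'property': feature}}
    # Geo-interface objects are inlined as values.
    elif hasattr(data, '__geo_interface__'):
        return {'values': data.__geo_interface__,
                'format': {'type': 'json', 'property': feature}}
    raise TypeError('Expected a URL string or an object with __geo_interface__.')
```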
**Review comment:** Now that there are three return statements in this code, the logic is pretty opaque (it took me a bit to read this and figure out what it was doing). I think the function should be refactored for clarity.

**Reply:** done
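Assuming the comment refers to the multiple early returns in `to_values`, one possible single-exit refactor (illustrative only; the PR's follow-up commit may differ):

```python
@curry
def to_values(data):
    """Replace a DataFrame by a data model with values."""
    check_data_type(data)
    if hasattr(data, '__geo_interface__'):
        if isinstance(data, pd.DataFrame):  # GeoPandas
            values = _geopandas_to_dict(sanitize_dataframe(data))
        else:
            values = data.__geo_interface__
        result = {'values': values, 'format': {'type': 'json'}}
    elif isinstance(data, pd.DataFrame):
        data = sanitize_dataframe(data)
        result = {'values': data.to_dict(orient='records')}
    else:  # dict input, already validated by check_data_type
        if 'values' not in data:
            raise KeyError('values expected in data dict, but not present.')
        result = data
    return result
```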