diff --git a/.gitignore b/.gitignore
index 37b3af1f3..e91d46e52 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,6 +62,7 @@ target/
 .ipynb_checkpoints
 .idea/*
+.vscode/*
 tools/_build
 Untitled*.ipynb
 .mypy*
diff --git a/altair/utils/core.py b/altair/utils/core.py
index 0b18638dd..84876670c 100644
--- a/altair/utils/core.py
+++ b/altair/utils/core.py
@@ -233,7 +233,7 @@ def parse_shorthand(shorthand, data=None, parse_aggregates=True,
     if parse_types:
         patterns = list(itertools.chain(*((p + ':{type}', p)
                                           for p in patterns)))
-    regexps = (re.compile('\A' + p.format(**units) + '\Z', re.DOTALL)
+    regexps = (re.compile(r'\A' + p.format(**units) + r'\Z', re.DOTALL)
               for p in patterns)
 
     # find matches depending on valid fields passed
@@ -375,3 +375,30 @@ def display_traceback(in_ipython=True):
         ip.showtraceback(exc_info)
     else:
         traceback.print_exception(*exc_info)
+
+
+def geopandas_to_dict(data):
+    try:
+        if ('geometry' != data.geometry.name) and ('geometry' in data.columns):
+            warnings.warn("The column name 'geometry' is reserved for the GeoDataFrame geometry. " +
+                          "A column named 'geometry' should contain the geometry to display, or be renamed. " +
+                          "Its data will not be accessible from the chart specification.")
+        if 'type' in data.columns:
+            warnings.warn("The column name 'type' is reserved for GeoDataFrame use. " +
+                          "Data in the 'type' column will not be accessible from the chart specification.")
+        if 'id' in data.columns:
+            warnings.warn("The column name 'id' is reserved for GeoDataFrame index values. " +
+                          "Data in the 'id' column will not be accessible from the chart specification.")
+        return [dict(row, type=feature['type'], geometry=feature['geometry'], id=feature['id'])
+                for row, feature in zip(
+                    data.drop(data.geometry.name, axis=1).to_dict('records'),
+                    data.geometry.__geo_interface__['features']
+                )]
+    except AttributeError as err:
+        if str(err).startswith('No geometry data set yet'):
+            warnings.warn("GeoDataFrame has no geometry to display.")
+            return data.to_dict('records')
+        else:
+            raise
\ No newline at end of file
diff --git a/altair/utils/data.py b/altair/utils/data.py
index 1b73de58f..3fe9b0018 100644
--- a/altair/utils/data.py
+++ b/altair/utils/data.py
@@ -1,6 +1,7 @@
 import json
-import random
 import hashlib
+import random
+import warnings
 
 import pandas as pd
 from toolz.curried import curry, pipe  # noqa
@@ -33,7 +34,7 @@
 # form.
 #
 # A data model transformer has the following type signature:
-# DataModelType = Union[dict, pd.DataFrame]
+# DataModelType = Union[dict, pd.DataFrame, gpd.GeoDataFrame, geojson.GeoJSON]
 # DataModelTransformerType = Callable[[DataModelType, KwArgs], DataModelType]
 # ==============================================================================
 
@@ -52,11 +53,10 @@ def limit_rows(data, max_rows=5000):
     check_data_type(data)
     if isinstance(data, pd.DataFrame):
         values = data
-    elif isinstance(data, dict):
-        if 'values' in data:
-            values = data['values']
-        else:
-            return data
+    elif isinstance(data, dict) and ('values' in data):
+        values = data['values']
+    else:
+        return data
     if max_rows is not None and len(values) > max_rows:
         raise MaxRowsError('The number of rows in your dataset is greater '
                            'than the maximum allowed ({0}). '
@@ -78,7 +78,6 @@ def sample(data, n=None, frac=None):
         values = random.sample(values, n)
     return {'values': values}
 
-
 @curry
 def to_json(data, prefix='altair-data', extension='json',
             filename="{prefix}-{hash}.{extension}"):
@@ -117,9 +116,23 @@ def to_csv(data, prefix='altair-data', extension='csv',
 def to_values(data):
     """Replace a DataFrame by a data model with values."""
     check_data_type(data)
-    if isinstance(data, pd.DataFrame):
+
+    if hasattr(data, '__geo_interface__'):
+        if isinstance(data, pd.DataFrame):  # GeoPandas
+            data = sanitize_dataframe(data)
+            return {
+                'values': _geopandas_to_dict(data),
+                'format': {'type': 'json'}
+            }
+        else:
+            return {
+                'values': data.__geo_interface__,
+                'format': {'type': 'json'}
+            }
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         return {'values': data.to_dict(orient='records')}
+
     elif isinstance(data, dict):
         if 'values' not in data:
             raise KeyError('values expected in data dict, but not present.')
@@ -128,8 +141,8 @@ def to_values(data):
 
 def check_data_type(data):
     """Raise if the data is not a dict or DataFrame."""
-    if not isinstance(data, (dict, pd.DataFrame)):
-        raise TypeError('Expected dict or DataFrame, got: {}'.format(type(data)))
+    if not (isinstance(data, (dict, pd.DataFrame)) or hasattr(data, '__geo_interface__')):
+        raise TypeError('Expected dict, DataFrame, GeoDataFrame or geojson, got: {}'.format(type(data)))
 
 
 # ==============================================================================
@@ -143,7 +156,14 @@ def _compute_data_hash(data_str):
 def _data_to_json_string(data):
     """Return a JSON string representation of the input data"""
     check_data_type(data)
-    if isinstance(data, pd.DataFrame):
+    if hasattr(data, '__geo_interface__'):
+        if isinstance(data, pd.DataFrame):  # GeoPandas
+            data = sanitize_dataframe(data)
+            values = _geopandas_to_dict(data)
+            return json.dumps(values)
+        else:
+            return json.dumps(data.__geo_interface__)
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         return data.to_json(orient='records')
     elif isinstance(data, dict):
@@ -158,7 +178,9 @@ def _data_to_json_string(data):
 def _data_to_csv_string(data):
     """return a CSV string representation of the input data"""
     check_data_type(data)
-    if isinstance(data, pd.DataFrame):
+    if hasattr(data, '__geo_interface__'):
+        raise NotImplementedError('use to_json or to_values with GeoJSON objects.')
+    elif isinstance(data, pd.DataFrame):
         data = sanitize_dataframe(data)
         return data.to_csv(index=False)
     elif isinstance(data, dict):
@@ -168,3 +190,28 @@ def _data_to_csv_string(data):
     else:
         raise NotImplementedError("to_csv only works with data expressed as "
                                   "a DataFrame or as a dict")
+
+
+def _geopandas_to_dict(data):
+    try:
+        if ('geometry' != data.geometry.name) and ('geometry' in data.columns):
+            warnings.warn("The column name 'geometry' is reserved for the GeoDataFrame geometry. " +
+                          "A column named 'geometry' should contain the geometry to display, or be renamed. " +
+                          "Its data will not be accessible from the chart specification.")
+        if 'type' in data.columns:
+            warnings.warn("The column name 'type' is reserved for GeoDataFrame use. " +
+                          "Data in the 'type' column will not be accessible from the chart specification.")
+        if 'id' in data.columns:
+            warnings.warn("The column name 'id' is reserved for GeoDataFrame index values. " +
+                          "Data in the 'id' column will not be accessible from the chart specification.")
+        return [dict(row, type=feature['type'], geometry=feature['geometry'], id=feature['id'])
+                for row, feature in zip(
+                    data.drop(data.geometry.name, axis=1).to_dict('records'),
+                    data.geometry.__geo_interface__['features']
+                )]
+    except AttributeError as err:
+        if str(err).startswith('No geometry data set yet'):
+            warnings.warn("GeoDataFrame has no geometry to display.")
+            return data.to_dict('records')
+        else:
+            raise
diff --git a/altair/utils/tests/test_data.py b/altair/utils/tests/test_data.py
index b50b2c410..00046da20 100644
--- a/altair/utils/tests/test_data.py
+++ b/altair/utils/tests/test_data.py
@@ -62,7 +62,7 @@ def test_to_values():
 
 
 def test_type_error():
-    """Ensure that TypeError is raised for types other than dict/DataFrame."""
+    """Ensure that TypeError is raised for types other than dict/DataFrame/GeoDataFrame/__geo_interface__."""
     for f in (sample, limit_rows, to_values):
         with pytest.raises(TypeError):
             pipe(0, f)
diff --git a/altair/utils/tests/test_geojson.py b/altair/utils/tests/test_geojson.py
new file mode 100644
index 000000000..357858656
--- /dev/null
+++ b/altair/utils/tests/test_geojson.py
@@ -0,0 +1,194 @@
+import pytest
+import pandas as pd
+import altair.vegalite.v2 as alt
+
+from ..data import pipe, to_values, to_csv
+from .. import parse_shorthand
+
+
+def _create_geojson():
+    return {
+        "type": "FeatureCollection",
+        "bbox": [
+            -161.30174569731454,
+            -60.39157788643298,
+            172.67580002536624,
+            42.438347020953984
+        ],
+        "features": [
+            {
+                "type": "Feature",
+                "properties": {"prop": 1},
+                "geometry": {
+                    "type": "LineString",
+                    "coordinates": [
+                        [-69.2980008004234, 23.18780298146116],
+                        [-161.30174569731454, -60.39157788643298],
+                        [172.67580002536624, 24.151450472748962]
+                    ]
+                },
+                "id": "0",
+                "bbox": [
+                    -161.30174569731454,
+                    -60.39157788643298,
+                    172.67580002536624,
+                    24.151450472748962
+                ]
+            },
+            {
+                "type": "Feature",
+                "properties": {"prop": 2},
+                "geometry": {
+                    "type": "LineString",
+                    "coordinates": [
+                        [156.03047546751765, 42.438347020953984],
+                        [35.46296546950265, -18.185542212943375],
+                        [152.53211600051463, 23.471406463455793]
+                    ]
+                },
+                "id": "1",
+                "bbox": [
+                    35.46296546950265,
+                    -18.185542212943375,
+                    156.03047546751765,
+                    42.438347020953984
+                ]
+            },
+            {
+                "type": "Feature",
+                "properties": {"prop": 3},
+                "geometry": {
+                    "type": "LineString",
+                    "coordinates": [
+                        [-133.98414913936503, 25.39468871174894],
+                        [145.04376601680605, 13.058626381790845],
+                        [170.30576801294046, 38.67128737163435]
+                    ]
+                },
+                "id": "2",
+                "bbox": [
+                    -133.98414913936503,
+                    13.058626381790845,
+                    170.30576801294046,
+                    38.67128737163435
+                ]
+            }
+        ]
+    }
+
+
+def _create_fake_geo_interface():
+    class FakeGeoJSON:
+        __geo_interface__ = _create_geojson()
+    return FakeGeoJSON()
+
+
+def _create_fake_geodataframe():
+    class FakeGeoSeries:
+        __geo_interface__ = _create_geojson()
+
+        def __init__(self, geometry_name='geometry'):
+            self.name = geometry_name
+
+    class FakeGeoDataFrame(pd.DataFrame):
+        __geo_interface__ = _create_geojson()
+        geometry = FakeGeoSeries()
+
+        def copy(self, deep=True):
+            data = self._data
+            if deep:
+                data = data.copy()
+            return FakeGeoDataFrame(data).__finalize__(self)
+
+        def drop(self, labels=None, axis=0, **kwargs):
+            if (axis == 1) and (self.geometry.name == labels):
+                return self.copy()
+            return super(FakeGeoDataFrame, self).drop(labels, axis, **kwargs)
+
+    return FakeGeoDataFrame({'prop': [1, 2, 3]})
+
+
+def test_to_values_geo():
+    """Test the to_values data transformer with geo data."""
+
+    data = _create_fake_geodataframe()
+    result = pipe(data, to_values)
+    assert result['format'] == {'type': 'json'}
+    assert result['values'][1]['geometry'] == data.__geo_interface__['features'][1]['geometry']
+    assert result['values'][1]['type'] == data.__geo_interface__['features'][1]['type']
+
+    data = _create_fake_geo_interface()
+    result = pipe(data, to_values)
+    assert result['format'] == {'type': 'json'}
+    assert result['values'] == data.__geo_interface__
+
+
+def test_chart_data_geotypes():
+    Chart = lambda data, **arg: alt.Chart(data).mark_geoshape().project().encode(**arg)
+
+    # Fake GeoPandas
+    data = _create_fake_geodataframe()
+    dct = Chart(data, fill='prop').to_dict()
+    assert dct['data']['values'][1]['geometry'] == data.__geo_interface__['features'][1]['geometry']
+    assert dct['data']['values'][1]['type'] == data.__geo_interface__['features'][1]['type']
+
+    # Fake GeoInterface
+    data = _create_fake_geo_interface()
+    dct = Chart(data).to_dict()
+    assert dct['data']['format'] == {'type': 'json'}
+    assert dct['data']['values'] == data.__geo_interface__
+
+
+def test_parse_shorthand_with_geodata():
+    def check(s, data, **kwargs):
+        assert parse_shorthand(s, data) == kwargs
+
+    data = _create_fake_geodataframe()
+
+    check('prop', data, field='prop', type='quantitative')
+    check('prop:N', data, field='prop', type='nominal')
+    check('count(prop)', data, field='prop', aggregate='count', type='quantitative')
+
+    data = _create_fake_geo_interface()
+
+    check('properties.prop:Q', data, field='properties.prop', type='quantitative')
+    check('prop', data, field='prop')
+
+
+def test_to_csv_geo():
+    """Test that to_csv raises an error with GeoPandas data."""
+
+    data = _create_fake_geodataframe()
+    with pytest.raises(NotImplementedError):
+        pipe(data, to_csv)
+
+
+def test_geo_pandas():
+    gpd = pytest.importorskip('geopandas')
+
+    data = gpd.GeoDataFrame.from_features(_create_geojson())
+    dct = alt.Chart(data).mark_geoshape().project().encode(fill='prop').to_dict()
+
+    assert dct['data']['format'] == {'type': 'json'}
+    assert dct['encoding'] == {'fill': {'field': 'prop', 'type': 'quantitative'}}
+
+    data2 = gpd.GeoDataFrame.from_features({
+        'type': 'FeatureCollection',
+        'features': [{'type': item['type'],
+                      'geometry': item['geometry'],
+                      'id': item['id'],
+                      'properties': {k: item[k]
+                                     for k in item.keys()
+                                     if k not in ('type', 'geometry')}
+                      } for item in dct['data']['values']]
+    })
+
+    assert (data2[data.columns] == data).all().all()
+
+
+def test_geojson_feature():
+    Chart = lambda data, **arg: alt.Chart(alt.geojson_feature(data, 'test_prop')
+                                          ).mark_geoshape().project().encode(**arg)
+
+    # Fake GeoInterface
+    data = _create_fake_geo_interface()
+    dct = Chart(data).to_dict()
+
+    assert dct['data']['format'] == {'type': 'json', 'property': 'test_prop'}
+    assert dct['data']['values'] == data.__geo_interface__
+
+    # url
+    data = "url.json"
+    dct = Chart(data).to_dict()
+
+    assert dct['data']['format'] == {'type': 'json', 'property': 'test_prop'}
+    assert dct['data']['url'] == data
diff --git a/altair/vegalite/v2/api.py b/altair/vegalite/v2/api.py
index 7ef6b2a98..3bb1bc604 100644
--- a/altair/vegalite/v2/api.py
+++ b/altair/vegalite/v2/api.py
@@ -9,6 +9,7 @@
 from .schema import core, channels, mixins, Undefined, SCHEMA_URL
 
 from .data import data_transformers, pipe
+from ..data import sanitize_dataframe
 from ... import utils, expr
 from .display import renderers, VEGALITE_VERSION, VEGAEMBED_VERSION, VEGA_VERSION
 from .theme import themes
@@ -19,11 +20,11 @@ def _prepare_data(data):
     """Convert input data to data for use within schema"""
     if data is Undefined:
         return data
+    elif isinstance(data, pd.DataFrame) or hasattr(data, '__geo_interface__'):
+        return pipe(data, data_transformers.get())
     elif isinstance(data, (dict, core.Data, core.InlineData,
                            core.UrlData, core.NamedData)):
         return data
-    elif isinstance(data, pd.DataFrame):
-        return pipe(data, data_transformers.get())
     elif isinstance(data, six.string_types):
         return core.UrlData(data)
     else:
@@ -1408,3 +1409,33 @@ def topo_feature(url, feature, **kwargs):
     """
     return core.UrlData(url=url, format=core.TopoDataFormat(type='topojson',
                                                             feature=feature, **kwargs))
+
+
+def geojson_feature(data, feature, **kwargs):
+    """A convenience function for extracting features from a GeoJSON object or URL
+
+    Parameters
+    ----------
+    data : anyOf(string, geojson.GeoJSON)
+        A string is interpreted as a URL from which to load the data set;
+        a geojson.GeoJSON object is interpreted as the data set itself.
+
+    feature : string
+        The JSON property containing the GeoJSON object set to convert to
+        a GeoJSON feature collection. For example ``features[0].geometry``.
+
+    **kwargs :
+        additional keywords passed to JsonDataFormat
+
+    """
+    if isinstance(data, six.string_types):
+        return core.UrlData(url=data, format=core.JsonDataFormat(type='json',
+                                                                 property=feature, **kwargs))
+    elif hasattr(data, '__geo_interface__'):
+        if isinstance(data, pd.DataFrame):  # GeoPandas
+            data = sanitize_dataframe(data)
+        return core.InlineData(values=data.__geo_interface__, format=core.JsonDataFormat(type='json',
+                                                                                         property=feature, **kwargs))
+    else:
+        warnings.warn("data of type {0} not recognized".format(type(data)))
+        return data
diff --git a/doc/user_guide/data.rst b/doc/user_guide/data.rst
index 4e980e6ee..5872e4ac4 100644
--- a/doc/user_guide/data.rst
+++ b/doc/user_guide/data.rst
@@ -8,11 +8,13 @@ Specifying Data in Altair
 Each top-level chart object (i.e. :class:`Chart`, :class:`LayerChart`,
 and :class:`VConcatChart`, :class:`HConcatChart``, :class:`RepeatChart`,
 :class:`FacetChart`) accepts a dataset as its first argument.
-The dataset can be specified in one of three ways:
+The dataset can be specified in one of five ways:
 
 - as a `Pandas DataFrame <https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe>`_
 - as a :class:`Data` or related object (i.e. :class:`UrlData`, :class:`InlineData`, :class:`NamedData`)
 - as a url string pointing to a ``json`` or ``csv`` formatted text file
+- as a GeoPandas_ GeoDataFrame
+- as a GeoJSON_ object
 
 For example, here we specify data via a DataFrame:
@@ -28,7 +30,7 @@ For example, here we specify data via a DataFrame:
     y='y',
 )
 
-When data is specified as a DataFrame, the encoding is quite simple, as Altair
+When data is specified as a DataFrame (or GeoDataFrame), the encoding is quite simple, as Altair
 uses the data type information provided by Pandas to automatically determine
 the data types required in the encoding.
@@ -179,3 +181,5 @@ For more information on the ``melt`` method, see the `Pandas melt documentation`
 .. _Pandas pivot documentation: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pivot.html
 .. _Pandas melt documentation: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.melt.html#pandas.DataFrame.melt
 .. _Reshaping and Pivot Tables: https://pandas.pydata.org/pandas-docs/stable/reshaping.html
+.. _GeoPandas: http://geopandas.org
+.. _GeoJSON: https://pypi.org/project/geojson/
\ No newline at end of file
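
Usage sketch (reviewer note, not part of the patch): a minimal example of the two new input paths this diff enables, assuming geopandas is installed. The `naturalearth_lowres` dataset and its `pop_est` column are geopandas' bundled sample data (available in geopandas releases of this era), and the URL below is a placeholder; neither comes from this diff.

    import altair.vegalite.v2 as alt
    import geopandas as gpd

    # GeoDataFrame input: _prepare_data detects __geo_interface__ and routes
    # the frame through the data transformers, so to_values emits one record
    # per feature with the reserved 'geometry', 'type' and 'id' keys.
    world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
    chart = alt.Chart(world).mark_geoshape().project().encode(fill='pop_est')

    # URL input via the new geojson_feature helper: yields UrlData with a
    # JsonDataFormat whose 'property' selects the feature set inside the
    # remote JSON document (hypothetical URL).
    url_chart = alt.Chart(alt.geojson_feature('https://example.com/data.geojson',
                                              'features')).mark_geoshape().project()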