Merge pull request #34 from john-sandall/csse-data
Add dataset: "coronavirus/CSSE"
john-sandall authored Mar 15, 2020
2 parents 762b1d3 + aa6ada7 commit abc53df
Showing 13 changed files with 471 additions and 66 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -9,3 +9,7 @@ maven.egg-info/

# IDE ignores
.vscode

# Checklists
DEPLOY
REVIEW
3 changes: 3 additions & 0 deletions README.md
@@ -28,6 +28,9 @@ Data dictionaries for all datasets are available by clicking on the dataset's na

| Dataset | Description | Date | Source | Licence |
| -- | -- | -- | -- | -- |
| **Coronavirus Datasets** |
| [**`coronavirus/CSSE`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/coronavirus) | Daily CSSE cases/deaths/recovered by country/region/state | Updated daily | [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/) | [See "Terms of Use" on CSSE repo](https://github.com/CSSEGISandData/COVID-19/) |
| **UK Political Datasets** |
| [**`general-election/UK/2010/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2010 General Election results | 6th May 2010 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) |
| [**`general-election/UK/2015/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2015 General Election results | 7th May 2015 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) |
| [**`general-election/UK/2017/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2017 General Election results | 8th June 2017 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) |
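Like the other datasets in this table, the new entry is fetched through `maven.get` (the same call shown in the `csse.py` docstring added further down):

```python
import maven

# Downloads the raw CSSE time-series files and writes the processed CSVs under ./data/.
maven.get("coronavirus/CSSE", data_directory="./data/")
```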
12 changes: 6 additions & 6 deletions dev-requirements.txt
@@ -12,25 +12,25 @@ click==7.0 # via pip-tools
decorator==4.4.1 # via ipython, traitlets
importlib-metadata==0.23 # via pluggy, pytest
ipython-genutils==0.2.0 # via traitlets
ipython==7.9.0
ipython==7.9.0 # via -r dev-requirements.in
jedi==0.15.1 # via ipython
more-itertools==7.2.0 # via pytest, zipp
more-itertools==7.2.0 # via pytest
packaging==19.2 # via pytest
parso==0.5.1 # via jedi
pexpect==4.7.0 # via ipython
pickleshare==0.7.5 # via ipython
pip-tools==4.2.0
pip-tools==4.2.0 # via -r dev-requirements.in
pluggy==0.13.0 # via pytest
prompt-toolkit==2.0.10 # via ipython
ptyprocess==0.6.0 # via pexpect
py==1.8.0 # via pytest
pygments==2.4.2 # via ipython
pyparsing==2.4.5 # via packaging
pytest==5.2.2
six==1.12.0 # via packaging, pip-tools, prompt-toolkit, traitlets
pytest==5.2.2 # via -r dev-requirements.in
six==1.12.0 # via -c requirements.txt, packaging, pip-tools, prompt-toolkit, traitlets
traitlets==4.3.3 # via ipython
wcwidth==0.1.7 # via prompt-toolkit, pytest
zipp==0.6.0 # via importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
# setuptools==45.1.0 # via ipython
# setuptools
36 changes: 36 additions & 0 deletions maven/datasets/coronavirus/README.md
@@ -0,0 +1,36 @@
# Coronavirus (COVID-19) datasets

If you have any questions about these datasets, please [contact me @John_Sandall](https://twitter.com/John_Sandall) on Twitter.


## Sources
We aim to source our data directly from the most authoritative data provider, falling back to less authoritative sources where a primary source isn't available.

Global providers/aggregators:
- [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/).


## Data dictionaries

#### **`coronavirus/CSSE`**

##### `CSSE_country_province.csv`
| Column | Type | Description | Example |
| -- | -- | -- | -- |
| `date` | date | Date | `2020-03-13` |
| `country_region` | str | Country/Region | `US` |
| `province_state` | str | Province/State | `Washington` |
| `lat` | float | Latitude | `47.4009` |
| `lon` | float | Longitude | `-121.4905` |
| `confirmed` | int | Confirmed cases | `568` |
| `deaths` | int | Fatalities | `37` |
| `recovered` | int | Recovered | `1` |

##### `CSSE_country.csv`
| Column | Type | Description | Example |
| -- | -- | -- | -- |
| `date` | date | Date | `2020-03-13` |
| `country_region` | str | Country/Region | `US` |
| `confirmed` | int | Confirmed cases | `2179` |
| `deaths` | int | Fatalities | `47` |
| `recovered` | int | Recovered | `12` |
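
Once the dataset has been fetched with the `maven.get` call above, the processed files can be loaded directly with pandas. A minimal sketch, assuming the output landed under the pipeline's default `data/coronavirus/CSSE/processed/` path (adjust to wherever `maven.get` wrote it on your machine):

```python
import pandas as pd

# Path assumed from the pipeline defaults (directory "data/coronavirus/CSSE" plus a
# "processed" subfolder); adjust if maven wrote the output elsewhere.
df = pd.read_csv("data/coronavirus/CSSE/processed/CSSE_country.csv", parse_dates=["date"])

# Daily totals for one country, most recent day last.
us = df[df["country_region"] == "US"].sort_values("date")
print(us[["date", "confirmed", "deaths", "recovered"]].tail())
```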
5 changes: 5 additions & 0 deletions maven/datasets/coronavirus/__init__.py
@@ -0,0 +1,5 @@
from .csse import CSSE

__all__ = [
"CSSE",
]
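
Because of this export, the pipeline class can also be imported directly. A small sketch assuming only what this diff shows (the constructor and `process()`); the rest of the inherited `utils.Pipeline` interface isn't visible here, and `maven.get('coronavirus/CSSE', ...)` remains the documented entry point:

```python
from pathlib import Path

from maven.datasets.coronavirus import CSSE

# Only the constructor and process() appear in this diff; retrieval of the raw CSVs
# presumably happens via the inherited utils.Pipeline machinery (an assumption here).
pipeline = CSSE(directory=Path("data/coronavirus/CSSE"))
```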
119 changes: 119 additions & 0 deletions maven/datasets/coronavirus/csse.py
@@ -0,0 +1,119 @@
"""
Coronavirus CSSE data from https://github.com/CSSEGISandData/COVID-19/
Usage:
>>> import maven
>>> maven.get('coronavirus/CSSE', data_directory='./data/')
Sources:
- https://github.com/CSSEGISandData/COVID-19/
"""
import os
from pathlib import Path

import pandas as pd

from maven import utils


class CSSE(utils.Pipeline):
"""Handle CSSE data from https://github.com/CSSEGISandData/COVID-19/"""

def __init__(self, directory=Path("data/coronavirus/CSSE")):
# inherit base __init__ but override default directory
super(CSSE, self).__init__(directory=directory)
# Source & targets
base_url = (
"https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
"csse_covid_19_data/csse_covid_19_time_series/"
)
self.sources = [
# url, filename, checksum
(base_url, "time_series_19-covid-Confirmed.csv", "09b6dfc1ee244ba652b8639f0aa2f093"),
(base_url, "time_series_19-covid-Deaths.csv", "69a9dfa8a901c8f0bbe0f6499db8641c"),
(base_url, "time_series_19-covid-Recovered.csv", "4d1c1d4f1c45514e3562cb42ef2729c7"),
]
self.targets = [
            # filename, checksum
("CSSE_country_province.csv", "bfce6bf16571fbb3004f9e5eee7b9e30"),
("CSSE_country.csv", "b5b3ed6fc75f323593fd7710a4262e1b"),
]
# Config
self.rename_source = False
self.retrieve_all = True
self.cache = True
self.verbose = False
self.verbose_name = "CSSE"

def process(self):
"""Process CSSE data."""
target_dir = self.directory / "processed"
os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist

def process_and_export():
"""Either caching disabled or file not yet processed; process regardless."""
data = {}
for metric in ["Confirmed", "Deaths", "Recovered"]:
df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv")
# Pivot all to long
id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
value_vars = list(set(df.columns) - set(id_vars))
df = df.melt(
id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric
)
df["date"] = pd.to_datetime(df.date, format="%m/%d/%y")
data[metric] = df.copy()

# Merge together
df_country_province = pd.merge(
data["Confirmed"],
data["Deaths"],
how="outer",
on=["Province/State", "Country/Region", "Lat", "Long", "date"],
).merge(
data["Recovered"],
how="outer",
on=["Province/State", "Country/Region", "Lat", "Long", "date"],
)

# Clean
df_country_province.columns = utils.sanitise(
df_country_province.columns, replace={"long": "lon"}
)
df_country_province = df_country_province[
[
"date",
"country_region",
"province_state",
"lat",
"lon",
"confirmed",
"deaths",
"recovered",
]
].sort_values(["date", "country_region", "province_state"])

# Country-level data
df_country = (
df_country_province.groupby(["date", "country_region"])[
["confirmed", "deaths", "recovered"]
]
.sum()
.reset_index()
)

# Export
print(f"Exporting dataset to {target_dir.resolve()}")
df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False)
df_country.to_csv(target_dir / "CSSE_country.csv", index=False)

for filename, checksum in self.targets:
utils.retrieve_from_cache_if_exists(
filename=filename,
target_dir=target_dir,
processing_fn=process_and_export,
md5_checksum=checksum,
caching_enabled=self.cache,
verbose=self.verbose,
)
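
To see the reshaping step in isolation, here is a minimal, self-contained sketch of the same wide-to-long melt that `process()` applies to each metric, run on a toy frame mimicking the CSSE layout (values are illustrative, not real data):

```python
import pandas as pd

# Toy frame in the CSSE wide layout: one row per province, one date column per day.
confirmed = pd.DataFrame(
    {
        "Province/State": ["Washington"],
        "Country/Region": ["US"],
        "Lat": [47.4009],
        "Long": [-121.4905],
        "3/12/20": [442],
        "3/13/20": [568],
    }
)

id_vars = ["Province/State", "Country/Region", "Lat", "Long"]
value_vars = [c for c in confirmed.columns if c not in id_vars]

# Same wide-to-long melt as in CSSE.process(): each date column becomes a row.
long_df = confirmed.melt(
    id_vars=id_vars, value_vars=value_vars, var_name="date", value_name="Confirmed"
)
long_df["date"] = pd.to_datetime(long_df["date"], format="%m/%d/%y")
print(long_df)  # two rows: 2020-03-12 -> 442 and 2020-03-13 -> 568
```

The Deaths and Recovered frames are melted the same way and then outer-merged on the province/country/coordinate columns plus `date`, which is what produces the long-format `CSSE_country_province.csv`.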