# Source code for wbpy.climate

# -*- coding: utf-8 -*-
import re
import datetime
import pprint
import json
import itertools

import pycountry

from . import utils


class ClimateDataset(object):
    """Base class for World Bank Climate API datasets.

    Subclasses are expected to set ``data_type`` and ``interval`` and to
    implement ``as_dict()`` (used by ``__str__``).
    """

    def __init__(self, api_calls, data_type, data_interval, call_date):
        """
        :param api_calls:
            List of dicts with the keys "url" and "resp". Necessary as multiple
            responses can form one dataset.

        :param data_type:
            eg. ``pr``, ``tas``, ``tmin_means``

        :param data_interval:
            eg. ``mavg``, ``decade``

        :param call_date:
            Date of the url call
        """
        self.api_call_date = call_date
        self.api_calls = api_calls

        self._data_type_arg = data_type
        self._interval_arg = data_interval

        # Attach a (code, description) "region" tuple to each API call. The
        # region identifier is the final path component of the request URL:
        # either a country code or a numeric basin ID.
        for resp in self.api_calls:
            region = str(resp["url"].split("/")[-1])
            try:
                code = utils.convert_country_code(region.upper(), "alpha2")
                # NOTE(review): ``alpha2=`` is the old pycountry keyword;
                # recent pycountry releases use ``alpha_2`` -- confirm against
                # the pinned pycountry version before upgrading.
                val = pycountry.countries.get(alpha2=code).name
            except KeyError:  # If not country code, assume it's a basin
                code = region
                # Bug fix: these two literals were previously separate
                # statements, so the "/climate_data_api_basins.pdf" half was a
                # discarded no-op and ``val`` held a truncated URL.
                # Parenthesising them makes the implicit concatenation work.
                val = ("http://data.worldbank.org/sites/default/files"
                       "/climate_data_api_basins.pdf")
            resp["region"] = (code, val)

    def __repr__(self):
        s = "<%s.%s(%r, %r) with id: %r>"
        return s % (
            self.__class__.__module__,
            self.__class__.__name__,
            self.data_type,
            self.interval,
            id(self),
            )

    def __str__(self):
        # Delegates to the subclass-provided as_dict().
        return pprint.pformat(self.as_dict())


class InstrumentalDataset(ClimateDataset):
    """Historical (instrumental) temperature/precipitation dataset."""

    def __init__(self, *args, **kwargs):
        super(InstrumentalDataset, self).__init__(*args, **kwargs)
        type_code = self._data_type_arg
        self.data_type = {type_code: ClimateAPI._instrumental_types[type_code]}
        self.interval = self._interval_arg
        # Date coverage note, as documented by the API provider.
        self.dates = (
            "data.worldbank.org/developers/climate-data-api: "
            "Country averages are for 1901-2009, basin averages are for "
            "1960-2009.")
        if self.interval == "decade":
            self.dates += (
                " For decadal requests, '1900' averages only 9 "
                "years, as the year 1900 is not included. ")

    def as_dict(self, use_datetime=False):
        """Return dataset data as dictionary.

        Keys are: data[location][date]

        :param use_datetime:
            Use datetime.date() objects for date keys, instead of strings.
        """
        results = {}
        if self.interval == "month":
            # Monthly data: one list of values per region, ordered by month.
            for call in self.api_calls:
                ordered = sorted(call["resp"],
                                 key=lambda r: float(r["month"]))
                results[call["region"][0]] = [
                    float(r["data"]) for r in ordered]
        else:
            # Yearly / decadal data: a {date: value} mapping per region.
            for call in self.api_calls:
                region_vals = {}
                for row in call["resp"]:
                    date_key = str(row["year"])
                    if use_datetime:
                        date_key = utils.worldbank_date_to_datetime(date_key)
                    region_vals[date_key] = float(row["data"])
                results[call["region"][0]] = region_vals
        return results
class ModelledDataset(ClimateDataset):
    """Modelled (Global Circulation Model) climate dataset."""

    def __init__(self, *args, **kwargs):
        super(ModelledDataset, self).__init__(*args, **kwargs)
        dt = self._data_type_arg
        self.data_type = {dt: ClimateAPI._modelled_types[dt]}
        intv = self._interval_arg
        self.interval = {intv: ClimateAPI._modelled_intervals[intv]}

        # Map the GCM keys present in the responses to their descriptions.
        self.gcms = {}
        for gcm_key in self.as_dict():
            if gcm_key in ClimateAPI._gcm:
                self.gcms[gcm_key] = ClimateAPI._gcm[gcm_key]

        # Collect SRES scenarios present in the data (only future-date rows
        # carry a "scenario" field).
        all_sres = set()
        for call in self.api_calls:
            for row in call["resp"]:
                row_sres = row.get("scenario")
                if row_sres:
                    all_sres.add(row_sres)
        self.sres = list(all_sres)

        # Bug fix: this previously tested ``self.data_type`` -- by then a
        # dict of {code: description} -- against the string list, which is
        # always False, so the ("1961", "1999") branch was unreachable.
        # Compare the raw data-type code instead.
        if dt in ("pr", "tas"):
            self.control_period = ("1961", "1999")
        else:
            self.control_period = ("1961", "2000")

    def dates(self, use_datetime=False):
        """Return dataset date start/end pairs.

        :param use_datetime:
            If True, return dates as datetime.date() object instead of
            strings.
        """
        dates = set()
        all_urls = [call["url"] for call in self.api_calls]
        for url in all_urls:
            # Raw string so the "\d" escapes are explicit regex syntax.
            start, end = re.findall(r"\d+/\d+", url)[0].split("/")
            if use_datetime:
                start = utils.worldbank_date_to_datetime(start)
                end = utils.worldbank_date_to_datetime(end)
            dates.add((start, end))
        return sorted(list(dates))

    def as_dict(self, sres="a2", use_datetime=False):
        """Return dataset data as dictionary.

        Keys are: data[gcm][location][date]

        :param sres:
            Which SRES to use for future values. The API supports A2 and B1,
            although not all GCMs have data for both.

        :param use_datetime:
            Use datetime.date() objects for date keys, instead of strings.
        """
        results = {}
        for call in self.api_calls:
            # Ensemble responses key values by percentile rather than by GCM,
            # and use a different field name for annual values.
            if "ensemble" in call["url"]:
                get_gcm_key = lambda row: "ensemble_%d" % row["percentile"]
                annual_data_key = "annualVal"
            else:
                get_gcm_key = lambda row: row["gcm"]
                annual_data_key = "annualData"

            region_code = call["region"][0]
            for row in call["resp"]:
                # Only future calls have scenarios. Limit results to one
                # scenario at a time, so we can have one value per time
                # period.
                row_scenario = row.get("scenario")
                if row_scenario and row_scenario != sres.lower():
                    continue

                gcm_key = get_gcm_key(row)
                if gcm_key not in results:
                    results[gcm_key] = {}
                if region_code not in results[gcm_key]:
                    results[gcm_key][region_code] = {}
                region_dict = results[gcm_key][region_code]

                year = str(row["toYear"])
                if use_datetime:
                    year = utils.worldbank_date_to_datetime(year)
                if year not in region_dict:
                    if "annual" in call["url"]:
                        val = float(row[annual_data_key][0])
                    else:
                        # Assume they are monthly values
                        val = row["monthVals"]
                    region_dict[year] = val
        return results
class ClimateAPI(object):
    """Request data from the World Bank Climate API.

    You can override the default tempfile cache by passing a function
    ``fetch``, which requests a URL and returns the response as a string.
    """

    # GCM (Global Circulation Model) API codes and their descriptions.
    # NOTE(review): ``microc3_2_medres`` looks like a typo for the API's
    # ``miroc3_2_medres`` -- it is a data key, so confirm against a live
    # response before changing it.
    _gcm = dict(
        bccr_bcm2_0="BCM 2.0",
        csiro_mk3_5="CSIRO Mark 3.5",
        ingv_echam4="ECHAM 4.6",
        cccma_cgcm3_1="CGCM 3.1 (T47)",
        cnrm_cm3="CNRM CM3",
        gfdl_cm2_0="GFDL CM2.0",
        gfdl_cm2_1="GFDL CM2.1",
        ipsl_cm4="IPSL-CM4",
        microc3_2_medres="MIROC 3.2 (medres)",
        miub_echo_g="ECHO-G",
        mpi_echam5="ECHAM5/MPI-OM",
        mri_cgcm2_3_2a="MRI-CGCM2.3.2",
        inmcm3_0="INMCM3.0",
        ukmo_hadcm3="UKMO HadCM3",
        ukmo_hadgem1="UKMO HadGEM1",
        ensemble="All percentile values of all models together",
        ensemble_10="10th percentile values of all models together",
        ensemble_50="50th percentile values of all models together",
        ensemble_90="90th percentile values of all models together",
    )

    # (start, end) year pairs the API accepts for GCM requests.
    _valid_modelled_dates = [
        (1920, 1939),
        (1940, 1959),
        (1960, 1979),
        (1980, 1999),
        (2020, 2039),
        (2040, 2059),
        (2060, 2079),
        (2080, 2099),
    ]

    # (start, end) year pairs the API accepts for derived-statistic requests.
    _valid_stat_dates = [
        (1961, 2000),
        (2046, 2065),
        (2081, 2100),
    ]

    _instrumental_types = dict(
        pr="Precipitation (rainfall and assumed water equivalent), in "
           "millimeters",
        tas="Temperature, in degrees Celsius",
    )

    _instrumental_intervals = ["year", "month", "decade"]

    _modelled_types = dict(
        tmin_means="Average daily minimum temperature, Celsius",
        tmax_means="Average daily maximum temperature, Celsius",
        tmax_days90th="Number of days with max temperature above the "
                      "control period's 90th percentile (hot days)",
        tmin_days90th="Number of days with min temperature above the "
                      "control period's 90th percentile (warm nights)",
        tmax_days10th="Number of days with max temperature below the "
                      "control period's 10th percentile (cool days)",
        tmin_days10th="Number of days with min temperature below the "
                      "control period's 10th percentile (cold nights)",
        tmin_days0="Number of days with min temperature below "
                   "0 degrees Celsius",
        ppt_days="Number of days with precipitation > 0.2mm",
        ppt_days2="Number of days with precipitation > 2mm",
        ppt_days10="Number of days with precipitation > 10mm",
        ppt_days90th="Number of days with precipitation > the control "
                     "period's 90th percentile",
        ppt_dryspell="Average number of days between precipitation "
                     "events",
        ppt_means="Average daily precipitation",
        pr=_instrumental_types["pr"],
        tas=_instrumental_types["tas"],
    )

    _modelled_intervals = dict(
        mavg="Monthly average",
        annualavg="Annual average",
        manom="Average monthly change (anomaly).",
        annualanom="Average annual change (anomaly).",
    )

    # Convenience codes
    _shorthand_codes = dict(
        aanom="annualanom",
        aavg="annualavg",
    )
    # Register each shorthand alongside the full code in whichever definition
    # dict contains the full code. Bug fix: ``.iteritems()`` is Python-2-only;
    # ``.items()`` behaves identically here and also runs on Python 3.
    for _k, _d_key in _shorthand_codes.items():
        for _d in [_instrumental_types, _modelled_types,
                   _instrumental_intervals, _modelled_intervals]:
            if _d_key in _d:
                _d[_k] = _d[_d_key]

    # Make them accessible via single attr
    ARG_DEFINITIONS = dict(
        instrumental_types=_instrumental_types,
        instrumental_intervals=_instrumental_intervals,
        modelled_types=_modelled_types,
        modelled_intervals=_modelled_intervals,
    )

    BASE_URL = "http://climatedataapi.worldbank.org/climateweb/rest/"

    def __init__(self, fetch=None):
        # Default to the package's caching fetch function.
        self.fetch = fetch if fetch else utils.fetch

    @staticmethod
    def _clean_api_code(code):
        """Lowercase ``code`` and expand any convenience shorthand."""
        code = code.lower()
        return ClimateAPI._shorthand_codes.get(code, code)

    def get_instrumental(self, data_type, interval, locations):
        """Get historical data for temperature or precipitation.

        :param data_type:
            Either ``pr`` for precipitation, or ``tas`` for temperature.

        :param interval:
            Either ``year``, ``month`` or ``decade``.

        :param locations:
            A list of API location codes - either ISO alpha-2 or alpha-3
            country codes, or basin ID numbers.

        :returns:
            An InstrumentalDataset built from one API call per location.
        """
        data_type = self._clean_api_code(data_type)
        interval = self._clean_api_code(interval)
        assert data_type in self.ARG_DEFINITIONS["instrumental_types"]
        assert interval in self.ARG_DEFINITIONS["instrumental_intervals"]

        # Construct URLs. Numeric locations are basin IDs; anything else is
        # treated as a country code and normalised to alpha-3.
        urls = []
        for loc in locations:
            try:
                int(loc)
                loc_type = "basin"
            except ValueError:
                loc = utils.convert_country_code(loc, "alpha3")
                loc_type = "country"

            data_url = "v1/{0}/cru/{1}/{2}/{3}".format(loc_type, data_type,
                interval, str(loc))
            full_url = "".join([self.BASE_URL, data_url])
            urls.append((loc, full_url))

        # If no exception from URL construction, make requests
        api_calls = []
        for loc, url in urls:
            resp = json.loads(self.fetch(url))
            api_calls.append(dict(
                url=url,
                resp=resp,
            ))

        call_date = datetime.datetime.now().date()
        return InstrumentalDataset(api_calls, data_interval=interval,
            data_type=data_type, call_date=call_date)

    def get_modelled(self, data_type, interval, locations):
        """Get modelled data for precipitation or temperature.

        :param data_type:
            The data statistic ID. See
            ``self.ARG_DEFINITIONS["modelled_types"]`` for IDs and values.

        :param interval:
            The interval ID. See
            ``self.ARG_DEFINITIONS["modelled_intervals"]`` for IDs and
            values.

        :param locations:
            A list of API location codes - either ISO alpha-2 or alpha-3
            country codes, or basin ID numbers.

        :returns:
            A ModelledDataset covering all supported date ranges.
        """
        data_type = self._clean_api_code(data_type)
        interval = self._clean_api_code(interval)
        assert data_type in self.ARG_DEFINITIONS["modelled_types"]
        assert interval in self.ARG_DEFINITIONS["modelled_intervals"]

        # As there aren't many variants of each data type, it's simplest to
        # always call both GCM and ensemble data, for all dates, and not
        # offer any filtering options.
        # Derived statistic requests are all of the "ensemble" kind, and
        # they have a different set of dates to GCM requests.
        if data_type in ["pr", "tas"]:
            all_urls = ["v1/{0}/{1}/{2}/{3}/{4}/{5}",
                        "v1/{0}/{1}/ensemble/{2}/{3}/{4}/{5}"]
            all_dates = self._valid_modelled_dates
        else:
            all_urls = ["v1/{0}/{1}/ensemble/{2}/{3}/{4}/{5}"]
            all_dates = self._valid_stat_dates

        api_calls = []
        for loc in locations:
            try:
                int(loc)  # basin ids are ints
                loc_type = "basin"
            except ValueError:
                loc = utils.convert_country_code(loc, "alpha3")
                loc_type = "country"

            # Request every (date-range, url-template) combination.
            for dates, url in itertools.product(all_dates, all_urls):
                start_date = dates[0]
                end_date = dates[1]
                rest_url = url.format(loc_type, interval, data_type,
                    start_date, end_date, loc)
                full_url = "".join([self.BASE_URL, rest_url])
                resp = json.loads(self.fetch(full_url))
                api_calls.append(dict(
                    url=full_url,
                    resp=resp,
                ))

        call_date = datetime.datetime.now().date()
        return ModelledDataset(api_calls, data_interval=interval,
            data_type=data_type, call_date=call_date)