Source code for pandasdmx.writer.data2pandas

# pandaSDMX is licensed under the Apache 2.0 license a copy of which
# is included in the source distribution of pandaSDMX.
# This is notwithstanding any licenses of third-party software included in
# this distribution.
# (c) 2014, 2015 Dr. Leo <fhaxbox66qgmail.com>


'''
This module contains a writer class that writes a generic data message to
pandas dataframes or series.
'''


from pandasdmx.writer import BaseWriter
from pandasdmx.utils import concat_namedtuples
import pandas as PD
import numpy as NP


class Writer(BaseWriter):
    '''Writer converting a generic data message into pandas objects.

    The message to be written is read from ``self.msg``; its header's
    ``dim_at_obs`` decides whether the dataset is treated as flat
    ('AllDimensions') or as a collection of series.
    '''

    _NOTHING_REQUESTED = (
        "At least one of 'dtype' or 'attributes' args must be True.")

    def write(self, source=None, asframe=True, dtype=NP.float64,
              attributes='', reverse_obs=False, fromfreq=False, parse_time=True):
        '''Transform a :class:`pandasdmx.model.DataMessage` instance to a pandas DataFrame
        or iterator over pandas Series.

        Args:
            source(pandasdmx.model.DataMessage): a pandasdmx.model.DataSet or iterator
                of pandasdmx.model.Series. If None (default), ``self.msg`` is used.

            asframe(bool): if True, merge the series of values and/or attributes
                into one or two multi-indexed
                pandas.DataFrame(s), otherwise return an iterator of pandas.Series.
                (default: True)

            dtype(str, NP.dtype, None): datatype for values. Defaults to NP.float64
                if None, do not return the values of a series. In this case,
                attributes must not be an empty string so that some attribute is returned.

            attributes(str, None): string determining which attributes, if any,
                should be returned in separate series or a separate DataFrame.
                Allowed values: '', 'o', 's', 'g', 'd'
                or any combination thereof such as 'os', 'go'. Defaults to ''.
                Where 'o', 's', 'g', and 'd' mean that attributes at observation,
                series, group and dataset level will be returned as members of
                per-observation namedtuples.
            reverse_obs(bool): if True, return observations in
                reverse order. Default: False
            fromfreq(bool): if True, extrapolate time periods
                from the first item and FREQ dimension. Default: False
            parse_time(bool): if True (default), try to generate datetime index, provided that
                dim_at_obs is 'TIME' or 'TIME_PERIOD'. Otherwise, ``parse_time`` is ignored. If False,
                always generate index of strings.
                Set it to False to increase performance and avoid
                parsing errors for exotic date-time formats unsupported by pandas.

        Returns:
            Depending on ``dtype`` and ``attributes``: the values, the
            attributes, or a ``(values, attributes)`` pair. Each item is a
            pandas.Series for a flat dataset, a pandas.DataFrame if
            ``asframe`` is True, and otherwise the result is the generator
            returned by :meth:`iter_pd_series`.

        Raises:
            TypeError: if ``attributes`` is not a string, or if no series
                can be obtained from ``source``.
            ValueError: if ``attributes`` contains characters other than
                'o', 's', 'g', 'd', or if neither values nor attributes
                are requested.
        '''
        dim_at_obs = self.msg.header.dim_at_obs
        attributes = self._normalize_attributes(attributes)

        # Fail early and uniformly instead of returning None (flat dataset)
        # or failing halfway through the conversion (series dataset).
        if not dtype and not attributes:
            raise ValueError(self._NOTHING_REQUESTED)

        # The signature advertises source=None; fall back to the message
        # this writer was created for rather than crashing later on.
        if source is None:
            source = self.msg

        # Is 'data' a flat dataset with just a list of obs?
        if dim_at_obs == 'AllDimensions':
            return self._write_flat(source, dtype, attributes)

        # So dataset has series.
        pd_iter = self.iter_pd_series(
            self._resolve_series(source), dim_at_obs, dtype, attributes,
            reverse_obs, fromfreq, parse_time)
        if not asframe:
            return pd_iter

        series_list = list(pd_iter)
        want_both = bool(dtype and attributes)
        if not series_list:
            # Nothing to merge: hand back empty frame(s) of the promised shape
            # instead of failing on series_list[0].
            if want_both:
                return PD.DataFrame(), PD.DataFrame()
            return PD.DataFrame()

        if want_both:
            # series_list is actually a list of pairs of series
            # containing data and metadata respectively
            pd_series, pd_attributes = zip(*series_list)
            key_fields = pd_series[0].name._fields
        elif dtype:
            pd_series = series_list
            key_fields = series_list[0].name._fields
        else:
            pd_attributes = series_list
            key_fields = series_list[0].name._fields

        d_frame = a_frame = None
        if dtype:
            # Merge series into multi-indexed DataFrame.
            d_frame = PD.concat(list(pd_series), axis=1, copy=False)
            d_frame.columns.set_names(key_fields, inplace=True)
        if attributes:
            a_frame = PD.concat(list(pd_attributes), axis=1, copy=False)
            a_frame.columns.set_names(key_fields, inplace=True)

        if want_both:
            return d_frame, a_frame
        return d_frame if dtype else a_frame

    @staticmethod
    def _normalize_attributes(attributes):
        '''Return ``attributes`` as a validated lower-case flag string.'''
        # None, False (and anything comparing equal to False) mean "none".
        if attributes is None or attributes == False:  # noqa: E712
            return ''
        try:
            attributes = attributes.lower()
        except AttributeError:
            raise TypeError("'attributes' argument must be of type str.")
        if set(attributes) - {'o', 's', 'g', 'd'}:
            raise ValueError(
                "'attributes' must only contain 'o', 's', 'd' or 'g'.")
        return attributes

    @staticmethod
    def _resolve_series(source):
        '''Return the iterable of series that ``source`` stands for.'''
        # Allow source to be an iterable, a dataset-like object exposing
        # 'series', or a message-like object exposing 'data.series'.
        if hasattr(source, '__iter__'):
            return source
        if hasattr(source, 'series'):
            return source.series
        if hasattr(source, 'data'):
            return source.data.series
        raise TypeError(
            "'source' must be iterable or provide a 'series' or 'data' attribute.")

    @staticmethod
    def _write_flat(source, dtype, attributes):
        '''Convert a flat ('AllDimensions') dataset into one or two pandas.Series.'''
        obs_zip = iter(
            zip(*source.data.obs(with_attributes='o' in attributes)))
        dimensions = next(obs_zip)
        idx = PD.MultiIndex.from_tuples(
            dimensions, names=dimensions[0]._fields)
        # The columns must be consumed in this order: values, then attributes.
        values_series = attrib_series = None
        if dtype:
            values_series = PD.Series(
                next(obs_zip), dtype=dtype, index=idx)
        if attributes:
            obs_attrib = NP.asarray(next(obs_zip), dtype='object')
            attrib_series = PD.Series(
                obs_attrib, dtype='object', index=idx)
        if dtype and attributes:
            return values_series, attrib_series
        return values_series if dtype else attrib_series

    @staticmethod
    def _frequency_of(series):
        '''Return the frequency code found in the series key or attributes, else None.'''
        key_fields = series.key._fields
        attrib = series.attrib
        attrib_fields = attrib._fields if attrib else ()
        # Lookup order is significant and kept as is.
        if 'FREQ' in key_fields:
            return series.key.FREQ
        if 'FREQUENCY' in attrib_fields:
            return attrib.FREQUENCY
        if 'FREQUENCY' in key_fields:
            return series.key.FREQUENCY
        if 'FREQ' in attrib_fields:
            return attrib.FREQ
        return None

    @staticmethod
    def _build_index(obs_dim, dim_at_obs, freq, fromfreq, parse_time):
        '''Build the pandas index for one series from its observation dimension values.'''
        if parse_time and dim_at_obs == 'TIME_PERIOD':
            # Half-yearly values such as '2010-S1' cannot be parsed by
            # pandas, so express them as the quarter starting each half-year.
            if freq == 'H':
                freq = '2Q'
                obs_dim = ['Q'.join((od[:-2], '1' if od[-1] == '1' else '3'))
                           for od in obs_dim]
            if fromfreq and freq:
                # Extrapolate from the first value and the frequency.
                return PD.period_range(
                    start=PD.Period(obs_dim[0], freq=freq),
                    periods=len(obs_dim), freq=freq, name=dim_at_obs)
            # No usable frequency, or we must not use it:
            # generate the index from all the obs dim values.
            return PD.PeriodIndex(
                [PD.Period(d, freq=freq) for d in obs_dim], name=dim_at_obs)

        if parse_time and dim_at_obs == 'TIME':
            # to_datetime parses the string values; the plain datetime
            # constructor only accepts integer components.
            if fromfreq and freq:
                return PD.date_range(
                    start=PD.to_datetime(obs_dim[0]), periods=len(obs_dim),
                    freq=freq, name=dim_at_obs)
            return PD.DatetimeIndex(
                PD.to_datetime(list(obs_dim)), name=dim_at_obs)

        # Not a datetime or period index or don't parse it
        return PD.Index(obs_dim, name=dim_at_obs)

    def iter_pd_series(self, iter_series, dim_at_obs, dtype,
                       attributes, reverse_obs, fromfreq, parse_time):
        '''Generate pandas.Series from an iterable of model series.

        Args:
            iter_series: iterable of series objects providing ``obs()``,
                ``key``, ``attrib``, ``group_attrib`` and ``dataset``.
            dim_at_obs(str): name of the dimension at observation; used as
                index name and to select the index type.
            dtype: datatype of the values; if falsy, no value series is built.
            attributes(str): flag string made of 'o', 's', 'g', 'd'; if
                empty, no attribute series is built.
            reverse_obs(bool): passed on to ``series.obs``.
            fromfreq(bool): if True and a frequency is found, build the index
                from the first observation and that frequency.
            parse_time(bool): if True, build a period or datetime index when
                ``dim_at_obs`` is 'TIME_PERIOD' or 'TIME'.

        Yields:
            For each series, the value series, the attribute series, or a
            ``(values, attributes)`` pair, each named after ``series.key``.

        Raises:
            ValueError: if both ``dtype`` and ``attributes`` are falsy.
        '''
        with_obs_attr = 'o' in attributes
        for series in iter_series:
            if not dtype and not attributes:
                raise ValueError(self._NOTHING_REQUESTED)

            # Generate the 3 main columns: index, values and attributes
            obs_zip = list(zip(*series.obs(with_values=dtype,
                                           with_attributes=with_obs_attr,
                                           reverse_obs=reverse_obs)))
            value_series = attrib_series = None

            if obs_zip:
                obs_dim, obs_attrib = obs_zip[0], obs_zip[2]
                series_index = self._build_index(
                    obs_dim, dim_at_obs, self._frequency_of(series),
                    fromfreq, parse_time)

                if dtype:
                    value_series = PD.Series(
                        NP.array(obs_zip[1], dtype=dtype),
                        index=series_index, name=series.key)

                if attributes:
                    # Assemble attributes of dataset, group and series if
                    # needed
                    gen_attrib = [attr
                                  for flag, attr in (('s', series.attrib),
                                                     ('g', series.group_attrib),
                                                     ('d', series.dataset.attrib))
                                  if (flag in attributes) and attr]
                    if gen_attrib:
                        gen_attrib = concat_namedtuples(*gen_attrib)
                    else:
                        gen_attrib = None

                    if with_obs_attr:
                        if gen_attrib:
                            # concat obs attributes with the general ones
                            attrib_iter = (concat_namedtuples(a, gen_attrib,
                                                              name='Attrib')
                                           for a in obs_attrib)
                        else:
                            # Simply take the obs attributes
                            attrib_iter = obs_attrib
                    else:
                        # Repeat the constant general attribute set
                        # (it may be None) for each obs
                        attrib_iter = (gen_attrib for _ in obs_attrib)

                    attrib_series = PD.Series(attrib_iter,
                                              index=series_index,
                                              dtype='object', name=series.key)
            else:
                # There are no observations. So generate empty Series,
                # with an explicit dtype so the result does not depend on
                # pandas' default for empty data.
                if dtype:
                    value_series = PD.Series(name=series.key, dtype=dtype)
                if attributes:
                    attrib_series = PD.Series(name=series.key, dtype='object')

            # decide what to yield
            if dtype and attributes:
                yield value_series, attrib_series
            elif dtype:
                yield value_series
            else:
                yield attrib_series
Source code for pandasdmx.writer.data2pandas

pandaSDMX

Navigation

Related Topics