# Source code for pandasdmx.writer.data2pandas

# pandaSDMX is licensed under the Apache 2.0 license a copy of which
# is included in the source distribution of pandaSDMX.
# This is notwithstanding any licenses of third-party software included in
# this distribution.
# (c) 2014, 2015 Dr. Leo <fhaxbox66qgmail.com>


'''
This module contains a writer class that writes a generic data message to
pandas dataframes or series.
'''


from pandasdmx.writer import BaseWriter
from pandasdmx.utils import concat_namedtuples
import pandas as PD
import numpy as NP


class Writer(BaseWriter):
    '''Writer turning a data message into pandas Series or DataFrames.'''

    def write(self, source=None, asframe=True, dtype=NP.float64,
              attributes='', reverse_obs=False, fromfreq=False,
              parse_time=True):
        '''Transform a :class:`pandasdmx.model.DataMessage` instance to a
        pandas DataFrame or iterator over pandas Series.

        Args:
            source(pandasdmx.model.DataMessage): a pandasdmx.model.DataSet
                or iterator of pandasdmx.model.Series. If None (default),
                ``self.msg`` is used.

            asframe(bool): if True, merge the series of values and/or
                attributes into one or two multi-indexed
                pandas.DataFrame(s), otherwise return an iterator of
                pandas.Series. (default: True)

            dtype(str, NP.dtype, None): datatype for values. Defaults to
                NP.float64. If None, do not return the values of a series.
                In this case, attributes must not be an empty string so
                that some attribute is returned.

            attributes(str, None): string determining which attributes, if
                any, should be returned in separate series or a separate
                DataFrame. Allowed values: '', 'o', 's', 'g', 'd' or any
                combination thereof such as 'os', 'go'. Defaults to ''.
                Where 'o', 's', 'g', and 'd' mean that attributes at
                observation, series, group and dataset level will be
                returned as members of per-observation namedtuples.

            reverse_obs(bool): if True, return observations in reverse
                order. Default: False

            fromfreq(bool): if True, extrapolate time periods from the
                first item and FREQ dimension. Default: False

            parse_time(bool): if True (default), try to generate datetime
                index, provided that dim_at_obs is 'TIME' or
                'TIME_PERIOD'. Otherwise, ``parse_time`` is ignored. If
                False, always generate index of strings. Set it to False
                to increase performance and avoid parsing errors for
                exotic date-time formats unsupported by pandas.

        Returns:
            For a flat dataset (``dim_at_obs == 'AllDimensions'``): a
            values Series, an attributes Series, or a ``(values,
            attributes)`` pair, depending on ``dtype`` and ``attributes``.
            Otherwise, if ``asframe`` is True: a values DataFrame, an
            attributes DataFrame, or a ``(values, attributes)`` pair; if
            ``asframe`` is False: the generator returned by
            :meth:`iter_pd_series`.

        Raises:
            TypeError: if ``attributes`` is not a string, or if no series
                can be obtained from ``source``.
            ValueError: if ``attributes`` contains characters other than
                'o', 's', 'g', 'd'.
        '''
        # Preparations
        dim_at_obs = self.msg.header.dim_at_obs

        # Validate 'attributes'. The '== False' comparison is kept on
        # purpose so that every value comparing equal to False is
        # normalised to the empty string, exactly as before.
        if attributes is None or attributes == False:  # noqa: E712
            attributes = ''
        else:
            try:
                attributes = attributes.lower()
            except AttributeError:
                raise TypeError("'attributes' argument must be of type str.")
            if set(attributes) - {'o', 's', 'g', 'd'}:
                raise ValueError(
                    "'attributes' must only contain 'o', 's', 'd' or 'g'.")
        with_obs_attr = 'o' in attributes

        # Without an explicit source, write the message held by this writer
        # instead of failing later on an unbound local variable.
        if source is None:
            source = self.msg

        # Allow source to be either an iterable or a model.DataSet instance
        iter_series = None
        if hasattr(source, '__iter__'):
            iter_series = source
        elif hasattr(source, 'series'):
            iter_series = source.series
        elif hasattr(source, 'data') and dim_at_obs != 'AllDimensions':
            iter_series = source.data.series

        # Is 'data' a flat dataset with just a list of obs?
        if dim_at_obs == 'AllDimensions':
            obs_zip = iter(
                zip(*source.data.obs(with_attributes=with_obs_attr)))
            dimensions = next(obs_zip)
            idx = PD.MultiIndex.from_tuples(
                dimensions, names=dimensions[0]._fields)
            # Always consume the values column, even when it is not
            # returned. Otherwise the next column taken from the iterator
            # would be the values rather than the attributes.
            obs_values = next(obs_zip)
            if dtype:
                values_series = PD.Series(obs_values, dtype=dtype, index=idx)
            if attributes:
                obs_attrib = NP.asarray(next(obs_zip), dtype='object')
                attrib_series = PD.Series(
                    obs_attrib, dtype='object', index=idx)
            # Decide what to return
            if dtype and attributes:
                return values_series, attrib_series
            elif dtype:
                return values_series
            elif attributes:
                return attrib_series
            return None

        # So dataset has series
        if iter_series is None:
            raise TypeError(
                "'source' must be an iterable of series or provide a "
                "'series' or 'data' attribute.")

        if not asframe:
            # Return an iterator
            return self.iter_pd_series(
                iter_series, dim_at_obs, dtype, attributes,
                reverse_obs, fromfreq, parse_time)

        series_list = list(self.iter_pd_series(
            iter_series, dim_at_obs, dtype, attributes,
            reverse_obs, fromfreq, parse_time))
        if dtype and attributes:
            # series_list is actually a list of pairs of series
            # containing data and metadata respectively
            key_fields = series_list[0][0].name._fields
            pd_series, pd_attributes = zip(*series_list)
        elif dtype:
            key_fields = series_list[0].name._fields
            pd_series = series_list
        elif attributes:
            key_fields = series_list[0].name._fields
            pd_attributes = series_list

        # Merge series into multi-indexed DataFrame(s)
        if dtype:
            d_frame = self._merge_to_frame(pd_series, key_fields)
        if attributes:
            a_frame = self._merge_to_frame(pd_attributes, key_fields)
        # Decide what to return
        if dtype and attributes:
            return d_frame, a_frame
        elif dtype:
            return d_frame
        else:
            return a_frame

    @staticmethod
    def _merge_to_frame(series_seq, key_fields):
        '''Concatenate series column-wise and name the column levels.'''
        frame = PD.concat(list(series_seq), axis=1, copy=False)
        # Assign the renamed index instead of using 'inplace=True', which
        # current pandas versions no longer accept.
        frame.columns = frame.columns.set_names(key_fields)
        return frame

    @staticmethod
    def _series_freq(series):
        '''Return the frequency found in the series key or attributes.

        Looks for 'FREQ' or 'FREQUENCY' in ``series.key`` and
        ``series.attrib``. Returns None if none of them is present.
        '''
        if 'FREQ' in series.key._fields:
            return series.key.FREQ
        elif series.attrib and 'FREQUENCY' in series.attrib._fields:
            return series.attrib.FREQUENCY
        elif 'FREQUENCY' in series.key._fields:
            return series.key.FREQUENCY
        elif series.attrib and 'FREQ' in series.attrib._fields:
            return series.attrib.FREQ
        return None

    @staticmethod
    def _build_index(obs_dim, freq, dim_at_obs, fromfreq, parse_time):
        '''Build the pandas index for the observations of one series.

        Yields a period index for 'TIME_PERIOD', a datetime index for
        'TIME' (both only if ``parse_time`` is true) and a plain index of
        the raw dimension values otherwise.
        '''
        n_obs = len(obs_dim)
        if parse_time and dim_at_obs == 'TIME_PERIOD':
            # pandas cannot parse half-yearly values such as '2010-S1'.
            # Map them onto the first/third quarter with a two-quarter
            # frequency.
            if freq == 'H':
                freq = '2Q'
                obs_dim = ['Q'.join((od[:-2], '1' if od[-1] == '1' else '3'))
                           for od in obs_dim]
            if fromfreq and freq:
                # Extrapolate the index from the first value and the freq
                return PD.period_range(
                    start=PD.Period(obs_dim[0], freq=freq),
                    periods=n_obs, freq=freq, name=dim_at_obs)
            # There is no freq or we must not use it.
            # So generate the index from all the obs dim values.
            return PD.PeriodIndex(
                [PD.Period(d, freq=freq) for d in obs_dim], name=dim_at_obs)
        elif parse_time and dim_at_obs == 'TIME':
            # The values are parsed with to_datetime: the former
            # 'PD.datetime' was merely datetime.datetime, which cannot be
            # constructed from a string.
            if fromfreq and freq:
                return PD.date_range(
                    start=PD.to_datetime(obs_dim[0]),
                    periods=n_obs, freq=freq, name=dim_at_obs)
            return PD.DatetimeIndex(
                PD.to_datetime(list(obs_dim)), name=dim_at_obs)
        # Not a datetime or period index or don't parse it
        return PD.Index(obs_dim, name=dim_at_obs)

    def iter_pd_series(self, iter_series, dim_at_obs, dtype, attributes,
                       reverse_obs, fromfreq, parse_time):
        '''Generate pandas Series from an iterable of series.

        For each item of ``iter_series``, yield a values Series (if
        ``dtype`` is true), an attributes Series (if ``attributes`` is
        true), or a ``(values, attributes)`` pair if both are requested.
        Each Series is named after ``series.key``. The remaining arguments
        have the same meaning as in :meth:`write`.

        Raises:
            ValueError: if neither ``dtype`` nor ``attributes`` is true.
        '''
        with_obs_attr = 'o' in attributes
        for series in iter_series:
            if not (dtype or attributes):
                raise ValueError(
                    "At least one of 'dtype' or 'attributes' args must be "
                    "True.")

            # Generate the 3 main columns: index, values and attributes
            obs_zip = list(zip(*series.obs(with_values=dtype,
                                           with_attributes=with_obs_attr,
                                           reverse_obs=reverse_obs)))

            # Are there observations at all?
            if obs_zip:
                obs_dim = obs_zip[0]
                obs_values = NP.array(obs_zip[1], dtype=dtype)
                obs_attrib = obs_zip[2]

                series_index = self._build_index(
                    obs_dim, self._series_freq(series), dim_at_obs,
                    fromfreq, parse_time)

                if dtype:
                    value_series = PD.Series(
                        obs_values, index=series_index, name=series.key)

                if attributes:
                    # Assemble attributes of dataset, group and series if
                    # needed
                    gen_attrib = [attr for flag, attr in
                                  (('s', series.attrib),
                                   ('g', series.group_attrib),
                                   ('d', series.dataset.attrib))
                                  if (flag in attributes) and attr]
                    if gen_attrib:
                        gen_attrib = concat_namedtuples(*gen_attrib)
                    else:
                        gen_attrib = None

                    if 'o' in attributes:
                        if gen_attrib:
                            # Concat with general attributes
                            attrib_values = [
                                concat_namedtuples(a, gen_attrib,
                                                   name='Attrib')
                                for a in obs_attrib]
                        else:
                            # Simply take the obs attributes
                            attrib_values = list(obs_attrib)
                    else:
                        # Repeat the constant general attribute set (it
                        # may be None) for each obs
                        attrib_values = [gen_attrib for _ in obs_attrib]
                    attrib_series = PD.Series(
                        attrib_values, index=series_index,
                        dtype='object', name=series.key)
            else:
                # There are no observations. So generate empty Series with
                # the same dtypes as used for non-empty ones.
                if dtype:
                    value_series = PD.Series(name=series.key, dtype=dtype)
                if attributes:
                    attrib_series = PD.Series(
                        name=series.key, dtype='object')

            # Decide what to yield
            if dtype and attributes:
                yield value_series, attrib_series
            elif dtype:
                yield value_series
            else:
                yield attrib_series