# pandasdmx.writer.structure2pd

# pandaSDMX is licensed under the Apache 2.0 license a copy of which
# is included in the source distribution of pandaSDMX.
# This is notwithstanding any licenses of third-party software included in
# this distribution.
# (c) 2014, 2015, 2016 Dr. Leo <fhaxbox66qgmail.com>


'''
This module contains a writer class that writes artefacts from a
StructureMessage to pandas DataFrames. This is useful, e.g., to visualize
codes from a codelist or concepts from a concept scheme. The writer is more
general, though: it can output any collection of nameable SDMX objects.
'''
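
# Minimal usage sketch (the agency and codelist IDs below are examples, not
# prescribed values; in practice this writer is invoked indirectly via
# pandasdmx.api.Response.write rather than instantiated by hand):
#
#     import pandasdmx
#     ecb = pandasdmx.Request('ECB')
#     resp = ecb.get(resource_type='codelist', resource_id='CL_FREQ')
#     df = resp.write(rows='codelist')   # delegates to Writer.write below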

from pandasdmx.utils import DictLike
from pandasdmx.writer import BaseWriter
import pandas as PD
import numpy as NP
from itertools import chain, repeat
from operator import attrgetter


class Writer(BaseWriter):

    _row_content = {'codelist', 'conceptscheme', 'dataflow',
                    'categoryscheme', 'provisionagreement'}

    def write(self, source=None, rows=None, **kwargs):
        '''
        Transform structural metadata, i.e. codelists, concept-schemes,
        lists of dataflow definitions or category-schemes, from a
        :class:`pandasdmx.model.StructureMessage` instance into a pandas
        DataFrame.

        This method is called by :meth:`pandasdmx.api.Response.write`. It is
        not part of the public-facing API, yet certain kwargs are propagated
        from there.

        Args:
            source(pandasdmx.model.StructureMessage): a
                :class:`pandasdmx.model.StructureMessage` instance.
            rows(str): sets the desired content to be extracted from the
                StructureMessage. Must be the name of an attribute of the
                StructureMessage. The attribute must be an instance of `dict`
                whose keys are strings. These will be interpreted as IDs and
                used for the MultiIndex of the DataFrame to be returned.
                Values can be either instances of `dict`, such as for
                codelists and categoryschemes, or simple nameable objects,
                such as for dataflows. In the latter case, the DataFrame will
                have a flat index. (default: depends on the content found in
                the message; 'codelist' is common)
            columns(str, list): if str, it denotes the attribute or attributes
                of the values (nameable SDMX objects such as Code or
                ConceptScheme) that will be stored in the DataFrame. If a
                list, it must contain strings that are valid attribute names.
                Defaults to ['name', 'description'].
            constraint(bool): if True (default), apply any constraints to
                codelists, i.e. only the codes allowed by the constraints
                attached to the DSD, dataflow and provision agreements
                contained in the message are written to the DataFrame.
                Otherwise, the entire codelist is written.
            lang(str): locale identifier. Specifies the preferred language for
                international strings such as names. Default is 'en'.
        '''
        # Set convenient default values for args
        # is rows a string?
        if rows is not None and not isinstance(rows, (list, tuple)):
            rows = [rows]
            return_df = True
        elif isinstance(rows, (list, tuple)) and len(rows) == 1:
            return_df = True
        else:
            return_df = False
        if rows is None:
            rows = [i for i in self._row_content if hasattr(source, i)]
        # Generate the DataFrame or -Frames and store them in a DictLike with
        # content-type names as keys
        frames = DictLike(
            {r: self._make_dataframe(source, r, **kwargs) for r in rows})
        if return_df:
            # There is only one item. So return the only value.
            return frames.any()
        else:
            return frames
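
    # Return-type sketch, assuming ``writer`` is an instance of this class and
    # ``msg`` is a StructureMessage holding both a codelist and a concept
    # scheme (identifiers are illustrative only):
    #
    #     frames = writer.write(msg)                # DictLike of DataFrames
    #     frames['codelist']                        # -> pandas.DataFrame
    #     cl = writer.write(msg, rows='codelist')   # single DataFrame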

    def _make_dataframe(self, source, rows, constraint=True,
                        columns=['name'], lang='en'):

        def make_column(scheme, item):
            if codelist_and_dsd:
                # scheme is a (dimension or attribute, codelist) pair
                dim_attr, scheme = scheme
            # first row of a scheme, DSD-less codelist, conceptscheme etc.
            if item is None:
                # take the column attributes from the scheme itself
                item = scheme
            raw = [getattr(item, s) for s in columns]
            # Select language for international strings represented as dict
            translated = [s[lang] if lang in s
                          else (s.get('en') or ((s or None) and s.any()))
                          for s in raw]
            # for codelists, prepend dim_or_attr flag
            if codelist_and_dsd:
                if dim_attr in dim2cl:
                    translated.insert(0, 'D')
                else:
                    translated.insert(0, 'A')
            if len(translated) > 1:
                return tuple(translated)
            else:
                return translated[0]

        def iter_keys(container):
            if codelist_and_dsd:
                # ``container`` is a pair of (dimension, codelist)
                component, codelist = container
                if constraint and source._constrained_codes:
                    result = (codelist[v]
                              for v in source._constrained_codes[component.id])
                else:
                    result = codelist.values()
            else:
                result = container.values()
            return sorted(result, key=attrgetter('id'))

        def iter_schemes():
            if codelist_and_dsd:
                return chain(dim2cl.items(), attr2cl.items())
            else:
                return content.values()

        def container2id(container, item):
            if codelist_and_dsd:
                # For the first index level, get the dimension or attribute ID
                # instead of the codelist ID.
                container_id = container[0].id
                # 2nd index col: the first row contains the concept, all
                # subsequent rows are codes.
                item_id = item.id
            else:
                # any other structure or codelist without DSD
                container_id = container.id
                item_id = item.id if item else None  # None in first row
            return container_id, item_id

        def row1_col2(container):
            if codelist_and_dsd:
                # return the concept of the dimension or attribute
                # instead of the (dim, codelist) pair
                return container[0].concept
            # all other cases: return None as there is nothing
            # interesting about, e.g., a dataflow.
            return None

        if rows == 'codelist':
            # Assuming a msg contains only one DSD
            try:
                dsd = source.datastructure.any()
                # Relate dimensions and attributes to corresponding codelists
                # to show this relation in the resulting dataframe
                dim2cl = {d: d.local_repr.enum()
                          for d in dsd.dimensions.values()
                          if d.local_repr.enum}
                attr2cl = {a: a.local_repr.enum()
                           for a in dsd.attributes.values()
                           if a.local_repr.enum}
            except Exception:
                dsd = None
        # pre-compute bool value to test for DSD-related codelists
        codelist_and_dsd = (rows == 'codelist' and dsd)

        # allow the `columns` arg to be a str
        if not isinstance(columns, (list, tuple)):
            columns = [columns]
        # Get the structures to be written, e.g. codelist, dataflow,
        # conceptscheme
        content = getattr(source, rows)  # 'source' is the SDMX message
        # Distinguish hierarchical content consisting of a dict of dicts, and
        # flat content consisting of a dict of atomic model instances. In the
        # former case, the resulting DataFrame will have 2 index levels.
        if isinstance(content.any(), dict):
            # Generate pairs of model instances, e.g. codelist and code. Their
            # structure resembles the multi-index tuples. The model instances
            # will be replaced by their id attributes later. For now we need
            # the model instances as we want to glean from them other
            # attributes for the dataframe columns.
            raw_tuples = chain.from_iterable(zip(
                # 1st index level, e.g. the ID of the dimension represented by
                # a codelist, or of a ConceptScheme etc.
                repeat(container),
                # 2nd index level: the first row in each codelist is the
                # corresponding container id. The following rows are item IDs.
                chain((row1_col2(container),), iter_keys(container)))
                for container in iter_schemes())
            # Now actually generate the index and related data for the columns
            raw_idx, data = zip(*[(container2id(i, j), make_column(i, j))
                                  for i, j in raw_tuples])
            idx = PD.MultiIndex.from_tuples(raw_idx)  # set names?
        else:
            # flat structure, e.g., dataflow definitions
            raw_tuples = sorted(content.values(), key=attrgetter('id'))
            raw_idx, data = zip(*((t.id, make_column(t, None))
                                  for t in raw_tuples))
            idx = PD.Index(raw_idx, name=rows)
        # For codelists, if there is a DSD, prepend 'dim_or_attr' as a
        # synthetic column. See the corresponding insert in make_column above.
        if codelist_and_dsd:
            # make a local copy to avoid side effects
            columns = columns[:]
            columns.insert(0, 'dim_or_attr')
        return PD.DataFrame(NP.array(data), index=idx, columns=columns)
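
# Illustrative shape of the result for ``rows='codelist'`` when the message
# also carries a DSD (IDs and names below are made up for the example): the
# first row of each block shows the dimension's or attribute's concept, the
# following rows its codes, and 'D'/'A' marks dimension vs. attribute.
#
#                             dim_or_attr          name
#     FREQ        FREQ                  D     Frequency
#                 A                     D        Annual
#                 M                     D       Monthly
#     OBS_STATUS  OBS_STATUS            A   Obs. status
#                 ...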