# pandasdmx.writer.structure2pd

# pandaSDMX is licensed under the Apache 2.0 license a copy of which
# is included in the source distribution of pandaSDMX.
# This is notwithstanding any licenses of third-party software included in
# this distribution.
# (c) 2014, 2015, 2016 Dr. Leo <fhaxbox66qgmail.com>


'''
This module contains a writer class that writes artefacts from a
StructureMessage to pandas DataFrames. This is useful, e.g., to visualize
codes from a codelist or concepts from a concept scheme. The writer is more
general, though: it can output any collection of nameable SDMX objects.
'''
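
# Minimal usage sketch (the agency and codelist IDs below are examples, not
# prescribed values; in practice this writer is invoked indirectly via
# pandasdmx.api.Response.write rather than instantiated by hand):
#
#     import pandasdmx
#     ecb = pandasdmx.Request('ECB')
#     resp = ecb.get(resource_type='codelist', resource_id='CL_FREQ')
#     df = resp.write(rows='codelist')   # delegates to Writer.write below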

from pandasdmx.utils import DictLike
from pandasdmx.writer import BaseWriter
import pandas as PD
import numpy as NP
from itertools import chain, repeat
from operator import attrgetter


class Writer(BaseWriter):

    _row_content = {'codelist', 'conceptscheme', 'dataflow',
                    'categoryscheme', 'provisionagreement'}

    def write(self, source=None, rows=None, **kwargs):
        '''
        Transform structural metadata, i.e. codelists, concept-schemes,
        lists of dataflow definitions or category-schemes, from a
        :class:`pandasdmx.model.StructureMessage` instance into a pandas
        DataFrame.

        This method is called by :meth:`pandasdmx.api.Response.write`. It is
        not part of the public-facing API, yet certain kwargs are propagated
        from there.

        Args:
            source(pandasdmx.model.StructureMessage): a
                :class:`pandasdmx.model.StructureMessage` instance.
            rows(str): sets the desired content to be extracted from the
                StructureMessage. Must be the name of an attribute of the
                StructureMessage. The attribute must be an instance of `dict`
                whose keys are strings. These will be interpreted as IDs and
                used for the MultiIndex of the DataFrame to be returned.
                Values can be either instances of `dict`, such as for
                codelists and categoryschemes, or simple nameable objects,
                such as for dataflows. In the latter case, the DataFrame will
                have a flat index. (default: depends on the content found in
                the message; 'codelist' is common)
            columns(str, list): if str, it denotes the attribute or attributes
                of the values (nameable SDMX objects such as Code or
                ConceptScheme) that will be stored in the DataFrame. If a
                list, it must contain strings that are valid attribute names.
                Defaults to ['name', 'description'].
            constraint(bool): if True (default), apply any constraints to
                codelists, i.e. only the codes allowed by the constraints
                attached to the DSD, dataflow and provision agreements
                contained in the message are written to the DataFrame.
                Otherwise, the entire codelist is written.
            lang(str): locale identifier. Specifies the preferred language for
                international strings such as names. Default is 'en'.
        '''
        # Set convenient default values for args
        # is rows a string?
        if rows is not None and not isinstance(rows, (list, tuple)):
            rows = [rows]
            return_df = True
        elif isinstance(rows, (list, tuple)) and len(rows) == 1:
            return_df = True
        else:
            return_df = False
        if rows is None:
            rows = [i for i in self._row_content if hasattr(source, i)]
        # Generate the DataFrame or -Frames and store them in a DictLike with
        # content-type names as keys
        frames = DictLike(
            {r: self._make_dataframe(source, r, **kwargs) for r in rows})
        if return_df:
            # There is only one item. So return the only value.
            return frames.any()
        else:
            return frames
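
    # Return-type sketch, assuming ``writer`` is an instance of this class and
    # ``msg`` is a StructureMessage holding both a codelist and a concept
    # scheme (identifiers are illustrative only):
    #
    #     frames = writer.write(msg)                # DictLike of DataFrames
    #     frames['codelist']                        # -> pandas.DataFrame
    #     cl = writer.write(msg, rows='codelist')   # single DataFrame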

    def _make_dataframe(self, source, rows, constraint=True,
                        columns=['name'], lang='en'):

        def make_column(scheme, item):
            if codelist_and_dsd:
                # scheme is a (dimension or attribute, codelist) pair
                dim_attr, scheme = scheme
            # first row of a scheme, DSD-less codelist, conceptscheme etc.
            if item is None:
                # take the column attributes from the scheme itself
                item = scheme
            raw = [getattr(item, s) for s in columns]
            # Select language for international strings represented as dict
            translated = [s[lang] if lang in s
                          else (s.get('en') or ((s or None) and s.any()))
                          for s in raw]
            # for codelists, prepend dim_or_attr flag
            if codelist_and_dsd:
                if dim_attr in dim2cl:
                    translated.insert(0, 'D')
                else:
                    translated.insert(0, 'A')
            if len(translated) > 1:
                return tuple(translated)
            else:
                return translated[0]

        def iter_keys(container):
            if codelist_and_dsd:
                # ``container`` is a pair of (dimension, codelist)
                component, codelist = container
                if constraint and source._constrained_codes:
                    result = (codelist[v]
                              for v in source._constrained_codes[component.id])
                else:
                    result = codelist.values()
            else:
                result = container.values()
            return sorted(result, key=attrgetter('id'))

        def iter_schemes():
            if codelist_and_dsd:
                return chain(dim2cl.items(), attr2cl.items())
            else:
                return content.values()

        def container2id(container, item):
            if codelist_and_dsd:
                # For the first index level, get the dimension or attribute ID
                # instead of the codelist ID.
                container_id = container[0].id
                # 2nd index col: the first row contains the concept, all
                # subsequent rows are codes.
                item_id = item.id
            else:
                # any other structure or codelist without DSD
                container_id = container.id
                item_id = item.id if item else None  # None in first row
            return container_id, item_id

        def row1_col2(container):
            if codelist_and_dsd:
                # return the concept of the dimension or attribute
                # instead of the (dim, codelist) pair
                return container[0].concept
            # all other cases: return None as there is nothing
            # interesting about, e.g., a dataflow.
            return None

        if rows == 'codelist':
            # Assuming a msg contains only one DSD
            try:
                dsd = source.datastructure.any()
                # Relate dimensions and attributes to corresponding codelists
                # to show this relation in the resulting dataframe
                dim2cl = {d: d.local_repr.enum()
                          for d in dsd.dimensions.values()
                          if d.local_repr.enum}
                attr2cl = {a: a.local_repr.enum()
                           for a in dsd.attributes.values()
                           if a.local_repr.enum}
            except Exception:
                dsd = None
        # pre-compute bool value to test for DSD-related codelists
        codelist_and_dsd = (rows == 'codelist' and dsd)

        # allow the `columns` arg to be a str
        if not isinstance(columns, (list, tuple)):
            columns = [columns]
        # Get the structures to be written, e.g. codelist, dataflow,
        # conceptscheme
        content = getattr(source, rows)  # 'source' is the SDMX message
        # Distinguish hierarchical content consisting of a dict of dicts, and
        # flat content consisting of a dict of atomic model instances. In the
        # former case, the resulting DataFrame will have 2 index levels.
        if isinstance(content.any(), dict):
            # Generate pairs of model instances, e.g. codelist and code. Their
            # structure resembles the multi-index tuples. The model instances
            # will be replaced by their id attributes later. For now we need
            # the model instances as we want to glean from them other
            # attributes for the dataframe columns.
            raw_tuples = chain.from_iterable(zip(
                # 1st index level, e.g. the ID of the dimension represented by
                # a codelist, or of a ConceptScheme etc.
                repeat(container),
                # 2nd index level: the first row in each codelist is the
                # corresponding container id. The following rows are item IDs.
                chain((row1_col2(container),), iter_keys(container)))
                for container in iter_schemes())
            # Now actually generate the index and related data for the columns
            raw_idx, data = zip(*[(container2id(i, j), make_column(i, j))
                                  for i, j in raw_tuples])
            idx = PD.MultiIndex.from_tuples(raw_idx)  # set names?
        else:
            # flat structure, e.g., dataflow definitions
            raw_tuples = sorted(content.values(), key=attrgetter('id'))
            raw_idx, data = zip(*((t.id, make_column(t, None))
                                  for t in raw_tuples))
            idx = PD.Index(raw_idx, name=rows)
        # For codelists, if there is a DSD, prepend 'dim_or_attr' as a
        # synthetic column. See the corresponding insert in make_column above.
        if codelist_and_dsd:
            # make a local copy to avoid side effects
            columns = columns[:]
            columns.insert(0, 'dim_or_attr')
        return PD.DataFrame(NP.array(data), index=idx, columns=columns)
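
# Illustrative shape of the result for ``rows='codelist'`` when the message
# also carries a DSD (IDs and names below are made up for the example): the
# first row of each block shows the dimension's or attribute's concept, the
# following rows its codes, and 'D'/'A' marks dimension vs. attribute.
#
#                             dim_or_attr          name
#     FREQ        FREQ                  D     Frequency
#                 A                     D        Annual
#                 M                     D       Monthly
#     OBS_STATUS  OBS_STATUS            A   Obs. status
#                 ...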