
"""SDMXML v2.1 reader."""
# See comments on the Reader() class for implementation details
from collections import defaultdict
from copy import copy
from inspect import isclass
from itertools import chain
import logging
import re

from lxml import etree
from lxml.etree import QName, XPath

from pandasdmx.exceptions import ParseError, XMLParseError
from pandasdmx.message import (
    DataMessage, ErrorMessage, Footer, Header, StructureMessage,
    )
import pandasdmx.model
from pandasdmx.model import (  # noqa: F401
    DEFAULT_LOCALE, Agency, AgencyScheme, AllDimensions, Annotation,
    AttributeDescriptor, NoSpecifiedRelationship, PrimaryMeasureRelationship,
    DimensionRelationship, AttributeValue, Categorisation, Category,
    CategoryScheme, Code, Codelist, ComponentValue, Concept, ConceptScheme,
    Contact, ContentConstraint, ConstraintRole, ConstraintRoleType, CubeRegion,
    DataAttribute, DataflowDefinition, DataKey, DataKeySet, DataProvider,
    DataProviderScheme, DataSet, DataStructureDefinition, Dimension,
    DimensionDescriptor, Facet, FacetValueType, GroupDimensionDescriptor,
    GroupKey, IdentifiableArtefact, InternationalString, ItemScheme,
    MaintainableArtefact, MeasureDescriptor, MeasureDimension, MemberSelection,
    MemberValue, Key, Observation, PrimaryMeasure, ProvisionAgreement,
    Representation, SeriesKey, TimeDimension, UsageStatus,
    )

from pandasdmx.reader import BaseReader


log = logging.getLogger(__name__)


# Regular expression for URNs used as references
URN = re.compile(r'urn:sdmx:org\.sdmx\.infomodel'
                 r'\.(?P<package>[^\.]*)'
                 r'\.(?P<class>[^=]*)=((?P<agency>[^:]*):)?'
                 r'(?P<id>[^\(\.]*)(\((?P<version>[\d\.]*)\))?'
                 r'(\.(?P<item_id>.*))?')
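# Illustrative example (not part of the original module): matching a typical
# item URN such as
#   urn:sdmx:org.sdmx.infomodel.codelist.Code=ESTAT:CL_GEO(1.0).DE
# yields the groups package='codelist', class='Code', agency='ESTAT',
# id='CL_GEO', version='1.0', and item_id='DE'.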


# XML namespaces
_base_ns = 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1'
NS = {
    'com': f'{_base_ns}/common',
    'data': f'{_base_ns}/data/structurespecific',
    'str': f'{_base_ns}/structure',
    'mes': f'{_base_ns}/message',
    'gen': f'{_base_ns}/data/generic',
    'footer': f'{_base_ns}/message/footer',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    }


def qname(ns, name):
    """Return a fully-qualified tag *name* in namespace *ns*."""
    return QName(NS[ns], name)
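
# Example (illustrative, not in the original source):
#   str(qname('mes', 'Structure')) ==
#   '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message}Structure'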


_TO_SNAKE_RE = re.compile('([A-Z]+)')


def to_snake(value):
    """Convert *value* from lowerCamelCase to snake_case."""
    return _TO_SNAKE_RE.sub(r'_\1', value).lower()
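
# Examples (illustrative, not in the original source):
#   to_snake('dimensionAtObservation') == 'dimension_at_observation'
#   to_snake('agencyID') == 'agency_id'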


# Mapping tag names → Message classes
MESSAGE = {qname('mes', name): cls for name, cls in (
    ('Structure', StructureMessage),
    ('GenericData', DataMessage),
    ('GenericTimeSeriesData', DataMessage),
    ('StructureSpecificData', DataMessage),
    ('StructureSpecificTimeSeriesData', DataMessage),
    ('Error', ErrorMessage),
    )}


# XPath expressions for parse_header()
HEADER_XPATH = {key: XPath(expr, namespaces=NS, smart_strings=False) for
                key, expr in (
    ('id', 'mes:ID/text()'),
    ('prepared', 'mes:Prepared/text()'),
    ('sender', 'mes:Sender/@*'),
    ('receiver', 'mes:Receiver/@*'),
    ('structure_id', 'mes:Structure/@structureID'),
    ('dim_at_obs', 'mes:Structure/@dimensionAtObservation'),
    # 'Structure' vs 'StructureUsage' varies across XML specimens.
    ('structure_ref_id', '(mes:Structure/com:Structure/Ref/@id | '
                         'mes:Structure/com:StructureUsage/Ref/@id)[1]'),
    ('structure_ref_agencyid', '(mes:Structure/com:Structure/Ref/@agencyID | '
                               'mes:Structure/com:StructureUsage/Ref/'
                               '@agencyID)[1]'),
    ('structure_ref_version', '(mes:Structure/com:Structure/Ref/@version | '
                              'mes:Structure/com:StructureUsage/Ref/'
                              '@version)[1]'),
    ('structure_ref_urn', 'mes:Structure/com:Structure/URN/text()'),
    )}


# For Reader._parse(): tag name → Reader.parse_[…] method to use
# TODO make this data structure more compact/avoid repetition
METHOD = {
    'AnnotationText': 'international_string',
    'Name': 'international_string',
    'Department': 'international_string',
    'Description': 'international_string',
    'Role': 'international_string',
    'Text': 'international_string',

    'DimensionList': 'componentlist',
    'AttributeList': 'componentlist',
    'MeasureList': 'componentlist',

    'CoreRepresentation': 'representation',
    'LocalRepresentation': 'representation',

    'AnnotationType': 'text',
    'AnnotationTitle': 'text',
    'AnnotationURL': 'text',
    'Email': 'text',
    'None': 'text',
    'Telephone': 'text',
    'URI': 'text',
    'URN': 'text',
    'Value': 'text',

    'ObsKey': 'key',
    'SeriesKey': 'key',
    'GroupKey': 'key',

    'AgencyScheme': 'orgscheme',
    'DataProviderScheme': 'orgscheme',

    'Agency': 'organisation',
    'DataProvider': 'organisation',

    'KeyValue': 'memberselection',

    'TextFormat': 'facet',
    'EnumerationFormat': 'facet',

    'MeasureDimension': 'dimension',
    'TimeDimension': 'dimension',

    # Tags that are bare containers for other XML elements; skip entirely
    'Annotations': 'SKIP',
    'CategorySchemes': 'SKIP',
    'Categorisations': 'SKIP',
    'Codelists': 'SKIP',
    'Concepts': 'SKIP',
    'Constraints': 'SKIP',
    'Dataflows': 'SKIP',
    'DataStructures': 'SKIP',
    'DataStructureComponents': 'SKIP',
    'Footer': 'SKIP',
    'OrganisationSchemes': 'SKIP',
    'ProvisionAgreements': 'SKIP',

    # Tag names that only ever contain references
    'AttachmentGroup': 'ref',
    'DimensionReference': 'ref',
    'Parent': 'ref',
    'Source': 'ref',
    'Structure': 'ref',  # str:Structure, not mes:Structure
    'Target': 'ref',
    # 'ConstraintAttachment': 'ref',
    'Enumeration': 'ref',
    }
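
# Note (added for clarity): tags not listed in METHOD fall back to a parser
# named after the tag itself: Reader._parse() uses METHOD.get(tag, tag) and
# then calls getattr(self, f'parse_{how}'.lower())(child). For example, a
# <str:Codelist> element dispatches to Reader.parse_codelist().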


# Mappings from SDMX-ML 'package' to contained classes
PACKAGE_CLASS = {
    'base': {Agency, AgencyScheme, DataProvider},
    'categoryscheme': {Category, Categorisation, CategoryScheme},
    'codelist': {Code, Codelist},
    'conceptscheme': {Concept, ConceptScheme},
    'datastructure': {DataflowDefinition, DataStructureDefinition},
    'registry': {ContentConstraint, ProvisionAgreement},
    }


def get_class(package, cls):
    """Return a class object for string *cls* and *package* names."""
    if isinstance(cls, str):
        if cls in ('Dataflow', 'DataStructure'):
            cls += 'Definition'
        cls = getattr(pandasdmx.model, cls)

    assert cls in PACKAGE_CLASS[package], \
        f'Package {package!r} invalid for {cls}'

    return cls
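
# Examples (illustrative, not in the original source):
#   get_class('codelist', 'Code') returns the Code class;
#   get_class('datastructure', 'Dataflow') returns DataflowDefinition.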


def wrap(value):
    """Return *value* as a list.

    Reader._parse(elem, unwrap=True) returns single children of *elem* as bare
    objects. wrap() ensures they are a list.
    """
    return value if isinstance(value, list) else [value]
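
# Example (illustrative): wrap('x') == ['x'], wrap(['x', 'y']) == ['x', 'y']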


def add_localizations(target, values):
    """Add localized strings from *values* to *target*."""
    if isinstance(values, tuple) and len(values) == 2:
        values = [values]
    target.localizations.update({locale: label for locale, label in values})
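
# Example (illustrative, not in the original source): both of the following
# add an 'en' localization to an InternationalString-valued attribute:
#   add_localizations(obj.name, ('en', 'Consumer price index'))
#   add_localizations(obj.name, [('en', 'Consumer price index')])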


class Reparse(Exception):
    """Raised for a forward reference to trigger reparsing."""
    pass


# Reader operates by recursion through the _parse() method:
#
# - _parse(elem) uses the XML tag name of elem, plus METHOD, to find a method
#    like Reader.parse_X().
# - parse_X(elem) is called. These methods perform similar tasks such as:
#   - Create an instance of a pandasdmx.model class,
#   - Recursively:
#     - call _parse() on the children of elem,
#     - call _named(), which also creates an instance of a NameableArtefact,
#   - Handle the returned values (i.e. parsed XML child elements) and attach
#     them to the model object,
#   - Handle the XML attributes of elem and attach these to the model object,
#   - ``assert len(values) == 0`` or similar to assert that all parsed child
#     elements and/or attributes have been consumed,
#   - Return the parsed model object to be used further up the recursive stack.
#
class Reader(BaseReader):
    """Read SDMX-ML 2.1 and expose it as instances from :mod:`pandasdmx.model`.

    The implementation is recursive, and depends on:

    - :meth:`_parse`, :meth:`_named` and :meth:`_maintained`.
    - State variables :attr:`_current`, :attr:`_stack`, :attr:`_index`.

    Parameters
    ----------
    dsd : :class:`~.DataStructureDefinition`
        For “structure-specific” `format`=``XML`` messages only.
    """
    # State variables for reader

    # Stack (0 = top) of tag names being parsed by _parse().
    # Tag parsers may examine the stack to determine context for parsing.
    _stack = []

    # Map of (class name, id) → pandasdmx.model object.
    # Only IdentifiableArtefacts should be stored. See _maintained().
    _index = {}

    # Similar to _index, but specific to the current scope.
    _current = {}

    def read_message(self, source, dsd=None):
        # Root XML element
        root = etree.parse(source).getroot()

        # Message class
        try:
            cls = MESSAGE[root.tag]
        except KeyError:
            msg = 'Unrecognized message root element {!r}'.format(root.tag)
            raise ParseError(msg) from None

        # Reset state
        self._stack = []
        self._index = {}
        self._current = {}

        # With 'dsd' argument, the message should be structure-specific
        if dsd is not None:
            if 'StructureSpecific' not in root.tag:
                log.warning('Ambiguous: dsd= argument for non-structure-'
                            'specific message')
            self._index[('DataStructureDefinition', dsd.id)] = dsd

        # Parse the tree
        values = self._parse(root)

        # Instantiate the message object
        msg = cls()

        # Store the header
        header = values.pop('header', None)
        if header is None and 'errormessage' in values:
            # An error message
            msg.header = Header()

            # Error message attributes resemble footer attributes
            values['footer'] = Footer(**values.pop('errormessage'))
        elif len(header) == 2:
            # Length-2 list includes DFD/DSD reference
            msg.header, msg.dataflow = header
            msg.observation_dimension = self._obs_dim
        else:
            # No DFD in the header, e.g. for a StructureMessage
            msg.header = header[0]

        # Store the footer
        msg.footer = values.pop('footer', None)

        # Finalize according to the message type
        if cls is DataMessage:
            # Simply store the datasets
            msg.data.extend(wrap(values.pop('dataset', [])))
        elif cls is StructureMessage:
            structures = values.pop('structures')

            # Populate dictionaries by ID
            for attr, name in (
                    ('dataflow', 'dataflows'),
                    ('codelist', 'codelists'),
                    ('constraint', 'constraints'),
                    ('structure', 'datastructures'),
                    ('category_scheme', 'categoryschemes'),
                    ('concept_scheme', 'concepts'),
                    ('organisation_scheme', 'organisationschemes'),
                    ('provisionagreement', 'provisionagreements'),
                    ):
                for obj in structures.pop(name, []):
                    getattr(msg, attr)[obj.id] = obj

            # Check, but do not store, Categorisations

            # Assemble a list of external categoryschemes
            ext_cs = []
            for key, cs in self._index.items():
                if key[0] == 'CategoryScheme' and cs.is_external_reference:
                    ext_cs.append(cs)

            for c in structures.pop('categorisations', []):
                if not isinstance(c.artefact, DataflowDefinition):
                    continue
                assert c.artefact in msg.dataflow.values()

                missing_cs = True
                for cs in chain(msg.category_scheme.values(), ext_cs):
                    if c.category in cs:
                        missing_cs = False
                        if cs.is_external_reference:
                            # Store the externally-referred CategoryScheme
                            msg.category_scheme[cs.id] = cs
                        break

                assert not missing_cs

            assert len(structures) == 0, structures

        assert len(values) == 0, values

        return msg

    def _parse(self, elem, unwrap=True):
        """Recursively parse the XML *elem*; return pandasdmx.model objects.

        Methods like 'Reader.parse_attribute()' are called for XML elements
        with tag names like '<ns:Attribute>'; each emits pandasdmx.model
        objects.

        If *unwrap* is True (the default), then single-entry lists are
        returned as bare objects.
        """
        # Container for results
        results = defaultdict(list)

        # Store state: tag name for the elem
        self._stack.append(QName(elem).localname)

        # Parse each child
        reparse = []  # Elements to reparse after the first pass
        reparse_limit = 2 * len(elem)
        for i, child in enumerate(chain(elem, reparse)):
            if i > reparse_limit:
                # Probably repeated failure to parse the same elements, which
                # would lead to an infinite loop
                raise ValueError(f'Unable to parse elements {reparse!r}')

            # Tag name for the child
            tag_name = QName(child).localname

            # Invoke the parser for this element
            try:
                # Get the name of the parser method
                how = METHOD.get(tag_name, tag_name)

                if how == 'SKIP':
                    # Element is a bare container for other elements; parse
                    # its children directly
                    result = list(chain(*self._parse(child, unwrap=False)
                                        .values()))
                elif how == 'ref' or (len(child) == 1
                                      and child[0].tag == 'Ref'):
                    # Element contains a reference
                    # Parse the reference; may raise Reparse (below)
                    result = [self.parse_ref(child[0], parent=tag_name)]
                else:
                    # All other elements
                    result = [getattr(self, f'parse_{how}'.lower())(child)]
            except Reparse as r:
                # Raise one level beyond the original to reparse <Parent><Ref>
                # instead of <Ref>
                if r.args[0] < 1:
                    self._stack.pop()
                    raise Reparse(r.args[0] + 1)

                # Add to the queue to be reparsed on the second pass
                reparse.append(child)

                # Continue with next child element
                continue
            except XMLParseError:
                raise  # Re-raise without adding to the stack
            except Exception as e:
                # Other exception, convert to XMLParseError
                raise XMLParseError(self, child) from e

                # NOTE to debug, use:
                # raise e

            # Add objects with IDs to the appropriate index
            self._add_to_index(result)

            # Store the parsed elements
            results[tag_name.lower()].extend(result)

        # Restore state
        self._clear_current(self._stack.pop())

        if unwrap:
            # Unwrap every value in results that is a length-1 list
            results = {k: v[0] if len(v) == 1 else v
                       for k, v in results.items()}

        return results

    def _add_to_index(self, items):
        """Add objects with IDs to the appropriate index."""
        for item in items:
            if isinstance(item, MaintainableArtefact) and not \
                    item.is_external_reference:
                # Global index for MaintainableArtefacts
                self._index[(item.__class__.__name__, item.id)] = item
            elif isinstance(item, IdentifiableArtefact):
                # Current scope index for IdentifiableArtefacts
                self._current[(item.__class__, item.id)] = item

    def _maintained(self, cls=None, id=None, urn=None, **kwargs):
        """Retrieve or instantiate a MaintainableArtefact of *cls* with *id*.

        If the object has been parsed (i.e. is in :attr:`_index`), it is
        returned; if not, it is instantiated with
        ``is_external_reference=True``, passing `kwargs`.

        If *urn* is given, it is used to determine *cls* and *id*, per the URN
        regular expression.
        """
        if urn:
            match = URN.match(urn).groupdict()
            cls = get_class(match['package'], match['class'])
            id = match['id']

            # Re-add the URN to the kwargs
            kwargs['urn'] = urn

        key = (cls.__name__, id) if isclass(cls) else (cls, id)

        # Maybe create a new object
        if key not in self._index:
            if not isclass(cls):
                raise TypeError(f'cannot instantiate from {cls!r}')
            elif not issubclass(cls, MaintainableArtefact):
                raise TypeError(f'{cls} is not maintainable')

            # A reference to a MaintainableArtefact that is not (yet) defined
            # in the current message is, necessarily, external, so finding
            # is_external_reference=False in the kwargs is a fatal error here.
            assert kwargs.setdefault('is_external_reference', True)

            # Create a new object and add to index
            self._index[key] = cls(id=id, **kwargs)

        # Existing or newly-created object
        return self._index[key]

    def _named(self, cls, elem, **kwargs):
        """Parse a NameableArtefact of *cls* from *elem*.

        NameableArtefacts may have .name and .description attributes that are
        InternationalStrings, plus zero or more Annotations. _named() handles
        these common elements, and returns an object and a _parse()'d dict of
        other, class-specific child values.

        Additional *kwargs* are used when parsing the children of *elem*.
        """
        # Apply conversions to attributes
        convert_attrs = {
            'agency_id': ('maintainer', lambda value: Agency(id=value)),
            'role': ('role',
                     lambda value: ConstraintRole(
                         role=ConstraintRoleType[value])),
        }

        attr = {}
        for name, value in elem.attrib.items():
            # Name in snake case
            name = to_snake(name)
            # Optional new name and function to transform the value
            (name, xform) = convert_attrs.get(name, (name, lambda v: v))
            # Store transformed value
            attr[name] = xform(value)

        try:
            # Maybe retrieve an existing reference
            obj = self._maintained(cls, **attr)
            # Since the object is now being parsed, it's defined in the
            # current message and no longer an external reference
            obj.is_external_reference = False
        except TypeError:
            # Instantiate the class and store its attributes
            obj = cls(**attr)

        # Store object for parsing children
        self._current[(cls, obj.id)] = obj

        # Parse children
        values = self._parse(elem, **kwargs)

        # Store the name, description and annotations
        add_localizations(obj.name, values.pop('name'))
        add_localizations(obj.description, values.pop('description', []))
        obj.annotations = wrap(values.pop('annotations', []))

        # Return the instance and any non-name values
        return obj, values

    def _get_current(self, cls, id=None):
        """Return an instance of *cls* in the :attr:`_current` scope.

        *cls* may be a single class or tuple of classes valid as the
        `classinfo` argument of :func:`issubclass`. If `id` is given, the
        object must also have a matching ID.

        Raises RuntimeError if there are zero instances, or more than one.
        """
        results = []
        for k, obj in self._current.items():
            if issubclass(k[0], cls) and (id is None or id == k[1]):
                results.append(obj)

        if len(results) == 1:
            return results[0]
        elif len(results) > 1:  # pragma: no cover
            raise RuntimeError(f'cannot disambiguate multiple {cls.__name__} '
                               f'in the current scope: {results}')
        else:  # pragma: no cover
            raise RuntimeError(f'no {cls.__name__} in the current scope')

    def _clear_current(self, scope):
        """Clear references from self._current at the end of *scope*."""
        classes = {
            'CategoryScheme': (Category, CategoryScheme),
            'Categorisation': (Categorisation,),
            'Codelist': (Code,),
            'ConceptScheme': (Concept, ConceptScheme),
            'ContentConstraint': (ContentConstraint,),
            'Dataflow': (DataflowDefinition,),
            'DataSet': (DataStructureDefinition,),
            'DataStructure': (DataStructureDefinition,),
        }.get(scope, [])

        if len(classes) == 0:
            return

        for k in list(self._current.keys()):
            if k[0] in classes:
                self._current.pop(k)

    def _get_cc_dsd(self):
        """Return the DSD for the ContentConstraint in the current scope."""
        return list(self._get_current(ContentConstraint).content)[0].structure

    # Parsers for common elements

    def parse_international_string(self, elem):
        # Return a tuple (locale, text)
        return (elem.attrib.get(qname('xml', 'lang'), DEFAULT_LOCALE),
                elem.text)

    def parse_text(self, elem):
        return elem.text

    def parse_ref(self, elem, parent=None):
        """References to Identifiable- and MaintainableArtefacts.

        `parent` is the tag containing the reference.
        """
        # Unused attributes
        attr = copy(elem.attrib)
        attr.pop('agencyID', None)
        attr.pop('version', None)

        if elem.tag == 'URN':
            # Ref is a URN
            return self._maintained(urn=elem.text)

        # Every non-URN ref has an 'id' attribute
        ref_id = attr.pop('id')

        # Determine the class of the ref'd object
        try:
            # 'package' and 'class' attributes give the class directly
            cls = get_class(attr.pop('package'), attr.pop('class'))
        except KeyError:
            # No 'package' and 'class' attributes
            if parent == 'Parent':
                # Ref to parent of an Item in an ItemScheme; the ref'd object
                # has the same class as the Item
                cls = getattr(pandasdmx.model, self._stack[-1])
            elif parent in ('AttachmentGroup', 'Group'):
                cls = GroupDimensionDescriptor
            elif parent in ('Dimension', 'DimensionReference'):
                # References to Dimensions
                cls = [Dimension, TimeDimension]
            else:
                cls = getattr(pandasdmx.model, parent)

        # Get or instantiate the object itself
        try:
            # Some refs to IdentifiableArtefacts specify the parent
            # MaintainableArtefact

            # Attributes of the maintainable parent; this raises KeyError if
            # not present
            parent_attrs = dict(
                id=attr.pop('maintainableParentID'),
                version=attr.pop('maintainableParentVersion'))

            assert len(attr) == 0

            # Class of the maintainable parent object
            parent_cls = {
                Category: CategoryScheme,
                Code: Codelist,
                Concept: ConceptScheme,
                DataProvider: DataProviderScheme,
            }[cls]

            # Retrieve or create the parent
            parent = self._maintained(parent_cls, **parent_attrs)

            # Retrieve or create the referenced object within the parent
            return parent.setdefault(id=ref_id, **attr)
        except KeyError:
            pass

        # Instantiate a new MaintainableArtefact
        try:
            return self._maintained(cls, id=ref_id)
        except TypeError:
            # 'cls' is not a MaintainableArtefact
            pass

        # Look up an existing IdentifiableArtefact in the current scope
        for cls in wrap(cls):
            try:
                return self._current[(cls, ref_id)]
            except KeyError:
                pass

        # Failed; probably a forward reference
        raise Reparse(0)

    # Parsers for elements appearing in data messages

    def parse_attributes(self, elem):
        result = {}

        ad = self._get_current(AttributeDescriptor)
        for e in elem.iterchildren():
            da = ad.get(e.attrib['id'])
            av = AttributeValue(value=e.attrib['value'], value_for=da)
            result[da.id] = av

        return result

    def parse_header(self, elem):
        # Collect values from *elem* and its children using XPath
        values = {}
        for key, xpath in HEADER_XPATH.items():
            matches = xpath(elem)
            if len(matches) == 0:
                continue
            values[key] = matches[0] if len(matches) == 1 else matches

        # Handle a reference to a DataStructureDefinition
        attrs = {}
        for k in ['id', 'agencyid', 'version', 'urn']:
            value = values.pop('structure_ref_' + k, None)
            if not value:
                continue
            elif k == 'agencyid':
                attrs['maintainer'] = Agency(id=value)
            else:
                attrs[k] = value

        if set(attrs.keys()) == {'urn'}:
            attrs['id'] = values['structure_id']

        extra = []

        if 'id' in attrs:
            # Create or retrieve the DSD. NB if the dsd argument was provided
            # to read_message(), this should be the same DSD
            dsd = self._maintained(DataStructureDefinition, **attrs)

            if 'structure_id' in values:
                # Add the DSD to the index a second time, using the message-
                # specific structure ID (rather than the DSD's own ID).
                key = ('DataStructureDefinition', values['structure_id'])
                self._index[key] = dsd

            # Create a DataflowDefinition
            dfd = DataflowDefinition(id=values.pop('structure_id'),
                                     structure=dsd)
            extra.append(dfd)

            # Store the dimension at the observation level
            dim_at_obs = values.pop('dim_at_obs')
            if dim_at_obs == 'AllDimensions':
                self._obs_dim = AllDimensions
            else:
                # Retrieve or create the Dimension
                args = dict(id=dim_at_obs, order=1e9)
                if 'TimeSeries' in self._stack[0]:
                    # {,StructureSpecific}TimeSeriesData message → the
                    # dimension at observation level is a TimeDimension
                    args['cls'] = TimeDimension
                self._obs_dim = dsd.dimensions.get(**args)

        # Maybe return the DFD; see .initialize()
        return [Header(**values)] + extra

    def parse_message(self, elem):
        f = Footer(**elem.attrib)
        for locale, label in self._parse(elem)['text']:
            f.text.append(InternationalString(**{locale: label}))
        return f

    def parse_dataset(self, elem):
        # Attributes: structure reference to a DSD
        for attr in ['structureRef', qname('data', 'structureRef')]:
            if attr in elem.attrib:
                structure_ref = elem.attrib[attr]
                break

        # Create or retrieve (structure-specific message) the DSD
        dsd = self._maintained(DataStructureDefinition, structure_ref)

        # Add DSD contents to the indices for use in recursive parsing
        self._add_to_index(indexables_from_dsd(dsd))
        self._current[(DataStructureDefinition, None)] = dsd

        # DataSet class, e.g. GenericDataSet for root XML tag 'GenericData'
        DataSetClass = getattr(pandasdmx.model, f'{self._stack[0]}Set')

        # Create the object
        ds = DataSetClass(structured_by=dsd)

        values = self._parse(elem, unwrap=False)

        # Process groups
        ds.group = {g: [] for g in values.pop('group', [])}

        # Process series
        for series_key, obs_list in values.pop('series', []):
            # Add observations under this key
            ds.add_obs(obs_list, series_key)

        # Process bare observations
        ds.add_obs(values.pop('obs', []))

        assert len(values) == 0
        return ds

    def parse_group(self, elem):
        """<generic:Group>, <structure:Group>, or <Group>."""
        values = self._parse(elem)

        # Check which namespace this Group tag is part of
        if elem.tag == qname('gen', 'Group'):
            # generic → GroupKey in a DataMessage
            gk = values.pop('groupkey')
            gk.attrib.update(values.pop('attributes', {}))
            result = gk
        elif elem.tag == qname('str', 'Group'):
            # structure → GroupDimensionDescriptor
            gdd = GroupDimensionDescriptor(
                id=elem.attrib['id'],
                components=wrap(values.pop('groupdimension')))

            # Early update of the DSD so that later definitions in the DSD
            # can reference gdd
            dsd = self._get_current(DataStructureDefinition)
            dsd.group_dimensions[gdd.id] = gdd

            result = gdd
        else:
            # no namespace → GroupKey in a StructureSpecificData message
            dsd = self._get_current(DataStructureDefinition)

            # Pop the 'type' attribute
            args = copy(elem.attrib)
            group_id = args.pop(qname('xsi', 'type')).split(':')[-1]

            try:
                gdd = self._current[(GroupDimensionDescriptor, group_id)]
            except KeyError:
                # DSD not supplied when parsing a StructureSpecificMessage
                pass
            else:
                args['described_by'] = gdd

            result = GroupKey(**args, dsd=dsd)

        assert len(values) == 0
        return result

    def parse_key(self, elem):
        """SeriesKey, GroupKey, observation dimensions."""
        cls = {
            'GroupKey': GroupKey,
            'ObsKey': Key,
            'SeriesKey': SeriesKey,
            'Key': DataKey,  # for DataKeySet
        }[QName(elem).localname]

        if cls is not DataKey:
            # Most data: the value is specified as an XML attribute
            kv = {e.attrib['id']: e.attrib['value']
                  for e in elem.iterchildren()}
            return cls(**kv, dsd=self._get_current(DataStructureDefinition))
        else:
            # <str:DataKeySet> and <str:CubeRegion>: the value(s) are
            # specified with a <com:Value>...</com:Value> element.
            kvs = {}
            for e in elem.iterchildren():
                c = self._get_cc_dsd().dimensions.get(e.attrib['id'])
                kvs[c] = ComponentValue(value_for=c,
                                        value=self._parse(e)['value'])
            return cls(included=elem.attrib.get('isIncluded', True),
                       key_value=kvs)

    def parse_obs(self, elem):
        values = self._parse(elem)

        dsd = self._get_current(DataStructureDefinition)

        # Attached attributes
        aa = values.pop('attributes', {})

        if 'obskey' in values:
            key = values.pop('obskey')
        elif 'obsdimension' in values:
            od = values.pop('obsdimension')
            dim = self._obs_dim.id
            if len(od) == 2:
                assert od['id'] == dim, (values, dim)
            key = Key(**{dim: od['value']}, dsd=dsd)

        if len(values):
            value = values.pop('obsvalue', None)
        else:
            # StructureSpecificData message: all information stored as XML
            # attributes of the <Observation>.
            attr = copy(elem.attrib)

            # Value of the observation
            value = attr.pop('OBS_VALUE', None)

            # Use the DSD to separate dimensions and attributes
            key = Key(**attr, dsd=dsd)

            # Remove attributes from the Key to be attached to the Observation
            aa.update(key.attrib)
            key.attrib = {}

        assert len(values) == 0, values
        return Observation(dimension=key, value=value, attached_attribute=aa)

    def parse_obsdimension(self, elem):
        assert set(elem.attrib.keys()) <= {'id', 'value'}
        return copy(elem.attrib)

    def parse_obsvalue(self, elem):
        assert len(elem.attrib) == 1, elem.attrib
        return elem.attrib['value']

    def parse_series(self, elem):
        values = self._parse(elem)
        try:
            series_key = values.pop('serieskey')
            series_key.attrib.update(values.pop('attributes', {}))
        except KeyError:
            # StructureSpecificData message
            dsd = self._get_current(DataStructureDefinition)
            series_key = SeriesKey(**elem.attrib, dsd=dsd)

        obs_list = wrap(values.pop('obs', []))
        for o in obs_list:
            o.series_key = series_key

        assert len(values) == 0
        return (series_key, obs_list)

    # Parsers for elements appearing in structure messages

    def parse_structures(self, elem):
        return self._parse(elem, unwrap=False)

    def parse_organisation(self, elem):
        cls = getattr(pandasdmx.model, QName(elem).localname)
        o, values = self._named(cls, elem)
        o.contact = wrap(values.pop('contact', []))
        assert len(values) == 0
        return o

    def parse_contact(self, elem):
        values = self._parse(elem, unwrap=False)

        # Map XML element names to the class attributes in the SDMX-IM spec
        values['name'] = values.pop('name')[0]
        values['telephone'] = values.pop('telephone', [None])[0]
        values['org_unit'] = values.pop('department', [{}])[0]
        values['responsibility'] = values.pop('role', [{}])[0]

        return Contact(**values)

    def parse_annotation(self, elem):
        values = self._parse(elem)

        # Rename values from child elements: 'annotationurl' → 'url'
        for tag in ('text', 'title', 'type', 'url'):
            try:
                values[tag] = values.pop('annotation' + tag)
            except KeyError:
                pass

        # Optional 'id' attribute
        try:
            values['id'] = elem.attrib['id']
        except KeyError:
            pass

        return Annotation(**values)

    def parse_code(self, elem):
        c, values = self._named(Code, elem)
        try:
            c.parent = values.pop('parent')
            c.parent.child.append(c)
        except KeyError:
            pass
        assert len(values) == 0, values
        return c

    def parse_categorisation(self, elem):
        c, values = self._named(Categorisation, elem)
        c.artefact = values.pop('source')
        c.category = values.pop('target')
        assert len(values) == 0
        return c

    def parse_category(self, elem):
        c, values = self._named(Category, elem, unwrap=False)
        for child_category in values.pop('category', []):
            c.child.append(child_category)
            child_category.parent = c
        assert len(values) == 0
        return c

    def parse_categoryscheme(self, elem):
        cs, values = self._named(CategoryScheme, elem)
        cs.extend(values.pop('category', []))
        assert len(values) == 0
        return cs

    def parse_codelist(self, elem):
        cl, values = self._named(Codelist, elem, unwrap=False)
        cl.extend(values.pop('code', []))
        assert len(values) == 0
        return cl

    def parse_concept(self, elem):
        c, values = self._named(Concept, elem)
        c.core_representation = values.pop('corerepresentation', None)
        try:
            c.parent = values.pop('parent')
        except KeyError:
            pass
        assert len(values) == 0
        return c

    def parse_conceptidentity(self, elem):
        # <ConceptIdentity> element can contain a child <URN>. Unlike other
        # URNs, this references a non-maintainable class (Concept), rather
        # than its maintainable parent (ConceptScheme); so parse_ref fails.

        # Parse children, which should only be a <URN>
        values = self._parse(elem)
        if set(values.keys()) != {'urn'}:
            raise ValueError(values)

        # URN should refer to a Concept
        match = URN.match(values['urn']).groupdict()
        if match['class'] != 'Concept':
            raise ValueError(values['urn'])

        # Look up the parent ConceptScheme
        cls = get_class(match['package'], 'ConceptScheme')
        cs = self._maintained(cls=cls, id=match['id'])

        # Get or create the Concept within *cs*
        return cs.setdefault(id=match['item_id'])

    def parse_constraintattachment(self, elem):
        constrainables = self._parse(elem)
        assert len(constrainables) == 1
        result = list(constrainables.values())[0]

        # Also add to the parent ContentConstraint for use in parsing
        # KeyValues
        self._get_current(ContentConstraint).content.add(result)

        return result

    def parse_orgscheme(self, elem):
        cls = getattr(pandasdmx.model, QName(elem).localname)
        os, values = self._named(cls, elem, unwrap=False)

        # Get the list of organisations. The following assumes that the
        # *values* dict has only one item. Otherwise, the returned item will
        # be unpredictable.
        # TODO review the code parsing the children to verify that the
        #      assumption always holds.
        _, orgs = values.popitem()

        os.extend(orgs)

        return os

    def parse_conceptscheme(self, elem):
        cs, values = self._named(ConceptScheme, elem, unwrap=False)
        cs.extend(values.pop('concept', []))
        assert len(values) == 0
        return cs

    def parse_dataflow(self, elem):
        dfd, values = self._named(DataflowDefinition, elem)
        dfd.structure = values.pop('structure')
        assert len(values) == 0
        return dfd

    def parse_datastructure(self, elem):
        dsd, values = self._named(DataStructureDefinition, elem)

        target = {
            DimensionDescriptor: 'dimensions',
            AttributeDescriptor: 'attributes',
            MeasureDescriptor: 'measures',
            GroupDimensionDescriptor: 'group_dimensions',
        }
        for c in values.pop('datastructurecomponents'):
            attr = target[type(c)]
            if attr == 'group_dimensions':
                # These are already added 'eagerly', by parse_group
                continue
            setattr(dsd, attr, c)

        assert len(values) == 0
        return dsd

    def parse_componentlist(self, elem):
        attr = copy(elem.attrib)

        # Determine the class
        try:
            cls_name = attr.pop('id')
        except KeyError:
            # SDMX-ML spec for, e.g. DimensionList: "The id attribute is
            # provided in this case for completeness. However, its value is
            # fixed to 'DimensionDescriptor'."
            cls_name = QName(elem).localname.replace('List', 'Descriptor')
        finally:
            ComponentListClass = getattr(pandasdmx.model, cls_name)

        cl = ComponentListClass(
            components=list(chain(*self._parse(elem, unwrap=False).values())),
            **attr,
        )

        try:
            cl.assign_order()
        except AttributeError:
            pass

        return cl

    def parse_dimension(self, elem):
        values = self._parse(elem)

        # Object class: Dimension, MeasureDimension, or TimeDimension
        DimensionClass = getattr(pandasdmx.model, QName(elem).localname)

        args = copy(elem.attrib)
        try:
            args['order'] = int(args.pop('position'))
        except KeyError:
            pass
        args.update(dict(
            concept_identity=values.pop('conceptidentity'),
            local_representation=values.pop('localrepresentation', None),
        ))

        assert len(values) == 0, values
        return DimensionClass(**args)

    def parse_groupdimension(self, elem):
        values = self._parse(elem)
        d = values.pop('dimensionreference')
        assert len(values) == 0
        return d

    def parse_attribute(self, elem):
        if self._stack[-1] == 'CubeRegion':
            # <com:Attribute> inside a CubeRegion is a MemberSelection
            return self.parse_memberselection(elem)

        args = dict(id=elem.attrib['id'])
        try:
            args['urn'] = elem.attrib['urn']
        except KeyError:
            pass
        try:
            us = elem.attrib['assignmentStatus']
        except KeyError:
            pass
        else:
            args['usage_status'] = UsageStatus[us.lower()]

        values = self._parse(elem)
        args.update(dict(
            concept_identity=values.pop('conceptidentity'),
            local_representation=values.pop('localrepresentation', None),
            related_to=values.pop('attributerelationship'),
        ))

        assert len(values) == 0
        return DataAttribute(**args)

    def parse_primarymeasure(self, elem):
        values = self._parse(elem)
        pm = PrimaryMeasure(
            concept_identity=values.pop('conceptidentity'),
            local_representation=values.pop('localrepresentation', None),
            **elem.attrib,
        )
        assert len(values) == 0
        return pm

    def parse_attributerelationship(self, elem):
        # Child element names
        tags = set([QName(e).localname for e in elem.iterchildren()])

        if 'PrimaryMeasure' not in tags:
            # Avoid recursive _parse() here, because it may contain a Ref to
            # a PrimaryMeasure that is not yet defined
            values = self._parse(elem, unwrap=False)
        else:
            values = []

        args = {}

        try:
            tags.remove('AttachmentGroup')
        except KeyError:
            pass
        else:
            args['group_key'] = values.pop('attachmentgroup')[0]

        tag = tags.pop()
        assert len(tags) == 0, tags

        cls = {
            'Dimension': DimensionRelationship,
            'PrimaryMeasure': PrimaryMeasureRelationship,
            'None': NoSpecifiedRelationship,
            'Group': DimensionRelationship,
        }[tag]

        if tag == 'Dimension':
            args['dimensions'] = values.pop('dimension')
        elif tag == 'Group':
            # Reference to a GroupDimensionDescriptor
            args['group_key'] = values.pop('group')[0]
        elif tag == 'None':
            values.pop('none')

        assert len(values) == 0, values
        return cls(**args)

    def parse_representation(self, elem):
        r = Representation()

        values = self._parse(elem, unwrap=False)
        if 'enumeration' in values:
            for e in values.pop('enumeration'):
                if isinstance(e, str):
                    e = ItemScheme(urn=e)
                r.enumerated = e
            if 'enumerationformat' in values:
                r.non_enumerated = values.pop('enumerationformat')
        elif 'textformat' in values:
            r.non_enumerated = values.pop('textformat')

        assert len(values) == 0
        return r

    def parse_facet(self, elem):
        # Parse facet value type; SDMX-ML default is 'String'
        fvt = elem.attrib.get('textType', 'String')

        # Convert case of the value. In XML, first letter is uppercase; in
        # the spec and Python enum, lowercase.
        f = Facet(value_type=FacetValueType[fvt[0].lower() + fvt[1:]])

        # Other attributes are for Facet.type, an instance of FacetType
        for key, value in elem.attrib.items():
            if key == 'textType':
                continue
            # Convert attribute name from camelCase to snake_case
            setattr(f.type, to_snake(key), value)

        return f

    # Parsers for constraints etc.

    def parse_contentconstraint(self, elem):
        role = elem.attrib.pop('type').lower()
        elem.attrib['role'] = 'allowable' if role == 'allowed' else role

        cc, values = self._named(ContentConstraint, elem)
        cc.content.update(wrap(values.pop('constraintattachment')))
        cc.data_content_region.append(values.pop('cuberegion', None))
        cc.data_content_keys = values.pop('datakeyset', None)

        assert len(values) == 0, values
        return cc

    def parse_cuberegion(self, elem):
        values = self._parse(elem, unwrap=False)
        cr = CubeRegion(included=elem.attrib['include'])

        # Combine member selections for Dimensions and Attributes
        for ms in values.pop('keyvalue', []) + values.pop('attribute', []):
            cr.member[ms.values_for] = ms

        assert len(values) == 0
        return cr

    def parse_memberselection(self, elem):
        """<com:KeyValue> (not inside <com:Key>); or <com:Attribute>."""
        values = self._parse(elem)
        values = list(map(lambda v: MemberValue(value=v), values['value']))

        # Values are for either a Dimension or Attribute, based on tag name
        kind = {
            'KeyValue': ('dimensions', Dimension),
            'Attribute': ('attributes', DataAttribute),
        }.get(QName(elem).localname)

        try:
            # Navigate from the current ContentConstraint to a
            # ConstrainableArtefact. If this is a DataFlow, it has a DSD.
            dsd = self._get_cc_dsd()
        except AttributeError:
            # Failed because the ContentConstraint is attached to something,
            # e.g. DataProvider, that does not provide an association to a
            # DSD. Try to get a Component from the current scope with
            # matching ID.
            component = self._get_current(cls=kind[1], id=elem.attrib['id'])
        else:
            # Get the Component from the correct list according to the kind
            component = getattr(dsd, kind[0]).get(elem.attrib['id'])

        return MemberSelection(values=values, values_for=component)

    def parse_datakeyset(self, elem):
        values = self._parse(elem)
        dks = DataKeySet(included=elem.attrib.pop('isIncluded'),
                         keys=values.pop('key'))
        assert len(values) == 0
        return dks

    def parse_provisionagreement(self, elem):
        pa, values = self._named(ProvisionAgreement, elem)
        pa.structure_usage = values.pop('structureusage')
        pa.data_provider = values.pop('dataprovider')
        assert len(values) == 0, values
        return pa

    # Parsers for elements appearing in error messages

    def parse_errormessage(self, elem):
        values = self._parse(elem)
        values['text'] = [InternationalString(values['text'])]
        values['code'] = elem.attrib['code']
        return values


def indexables_from_dsd(dsd):
    """Return indexable items from a DSD."""
    # AttributeDescriptor and DataAttributes
    yield dsd.attributes
    yield from dsd.attributes.components

    # DimensionDescriptor and *Dimensions
    yield dsd.dimensions
    yield from dsd.dimensions.components

    if dsd.measures:
        yield dsd.measures
        yield from dsd.measures.components

    for gdd in dsd.group_dimensions.values():
        yield gdd
        yield from gdd.components