Source code for csvw.datatypes

"""
We model the hierarchy of basic datatypes using a class hierarchy.

`Derived datatypes <https://www.w3.org/TR/tabular-metadata/#derived-datatypes>`_ are implemented
via :class:`csvw.Datatype` which is
`composed of <https://realpython.com/inheritance-composition-python/#whats-composition>`_
a basic datatype and additional behaviour.

.. seealso:: http://w3c.github.io/csvw/metadata/#datatypes
"""
import re
import json as _json
import math
import base64
import typing
import decimal as _decimal
import binascii
import datetime
import warnings
import itertools
import collections

import isodate
import rfc3986
import dateutil.parser
import babel.numbers
import babel.dates
import jsonschema

if typing.TYPE_CHECKING:  # pragma: no cover
    import csvw

# Only the registry itself is exported by `from csvw.datatypes import *`.
__all__ = ['DATATYPES']

# Registry filled by the `register` decorator: maps a class' `name` attribute to the class.
DATATYPES = {}


def register(cls):
    """
    Class decorator adding `cls` to the `DATATYPES` registry.

    The class is stored under its `name` attribute, replacing any class registered under the
    same name before, and is returned unchanged.
    """
    key = cls.name
    DATATYPES[key] = cls
    return cls


def to_binary(s, encoding='utf-8'):
    """
    Return `s` as `bytes`.

    `bytes` input is passed through untouched, anything else is converted by
    `bytes(s, encoding=encoding)`.
    """
    if isinstance(s, bytes):
        return s  # pragma: no cover
    return bytes(s, encoding=encoding)


@register
class anyAtomicType:
    """
    Root of the class hierarchy of basic datatypes.

    A basic datatype consists of

    - a bag of attributes, most importantly a `name` which matches the name or alias of one of
      the `CSVW built-in datatypes <https://www.w3.org/TR/tabular-metadata/#built-in-datatypes>`_
    - three staticmethods controlling marshalling and unmarshalling of Python objects to strings.

    These methods are orchestrated from :class:`csvw.Datatype` in its `read` and `formatted`
    methods.
    """
    name = 'any'
    # NOTE(review): presumably tells `csvw.Datatype` whether minimum/maximum constraints are
    # supported for the datatype -- confirm against the caller.
    minmax = False
    example = 'x'

    @classmethod
    def value_error(cls, v):
        """Always raise a `ValueError` naming the datatype and the offending value `v`."""
        message = 'invalid lexical value for {}: {}'.format(cls.name, v)
        raise ValueError(message)

    def __str__(self) -> str:
        return self.name

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """Return the keyword arguments for `to_python` and `to_string`; none for this type."""
        return dict()

    @staticmethod
    def to_python(v: str, **kw) -> object:
        """Convert the string `v` to a Python object; the identity for this type."""
        return v  # pragma: no cover

    @staticmethod
    def to_string(v: object, **kw) -> str:
        """Convert the Python object `v` to its string representation."""
        return format(v)
@register
class string(anyAtomicType):
    """
    Maps to `str`.

    The lexical and value spaces of xs:string are the set of all possible strings composed of
    any character allowed in a XML 1.0 document without any treatment done on whitespaces.
    """
    name = 'string'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Compile the `format` property of `datatype` into a regex.

        Returns `{'regex': <compiled pattern>}`, or `{}` if there is no format or if the format
        is not a valid regular expression (in which case a warning is issued).
        """
        if not datatype.format:
            return {}
        # The regex specified as `format` is wrapped into a group and followed by `$`, to make
        # sure the whole string is matched when validating.
        anchored = r'({})$'.format(datatype.format)
        try:
            return {'regex': re.compile(anchored)}
        except re.error:
            warnings.warn('Invalid regex pattern as datatype format')
        return {}

    @staticmethod
    def to_python(v, regex=None):
        """Return `v` unchanged; raise `ValueError` if `regex` is given and doesn't match."""
        matched = regex.match(v) if regex else True
        if not matched:
            string.value_error(v)
        return v
@register
class anyURI(string):
    """
    Maps to `rfc3986.URIReference`.

    This datatype corresponds normatively to the XLink href attribute. Its value space includes
    the URIs defined by the RFCs `2396 <https://datatracker.ietf.org/doc/html/rfc2396>`_ and
    `2732 <https://datatracker.ietf.org/doc/html/rfc2732>`_, but its lexical space doesn’t
    require the character escapes needed to include non-ASCII characters in URIs.

    .. note::

        We normalize URLs according to the rules in
        `RFC 3986 <https://datatracker.ietf.org/doc/html/rfc3986#section-6.2>`_ when serializing
        to `str`. Thus roundtripping isn't guaranteed.
    """
    name = 'anyURI'

    @staticmethod
    def to_python(v, regex=None):
        """Validate `v` like a `string`, then parse it into a `rfc3986.URIReference`."""
        checked = string.to_python(v, regex=regex)
        return rfc3986.URIReference.from_string(checked.encode('utf-8'))

    @staticmethod
    def to_string(v, **kw):
        """Serialize a parsed URI object or a plain `str` (which gets normalized)."""
        # `geturl` is presumably provided by a `urllib.parse.ParseResult`, `unsplit` by a
        # `rfc3986.URIReference`. The order of the checks matters if an object has both.
        for serializer in ('geturl', 'unsplit'):
            if hasattr(v, serializer):
                return getattr(v, serializer)()
        assert isinstance(v, str)
        return rfc3986.normalize_uri(v)
@register
class NMTOKEN(string):
    """
    Maps to `str`

    The lexical and value spaces of xs:NMTOKEN are the set of XML 1.0 “name tokens,” i.e.,
    tokens composed of characters, digits, “.”, “:”, “-”, and the characters defined by Unicode,
    such as “combining” or “extender”.

    This type is usually called a “token.”

    Valid values include "Snoopy", "CMS", "1950-10-04", or "0836217462".

    Invalid values include "brought classical music to the Peanuts strip" (spaces are forbidden)
    or "bold,brash" (commas are forbidden).
    """
    name = "NMTOKEN"

    @staticmethod
    def to_python(v, regex=None):
        """Validate `v` like a `string` and additionally against the set of token characters."""
        token = string.to_python(v, regex=regex)
        if re.fullmatch(r'[\w.:-]*', token) is None:
            NMTOKEN.value_error(token)
        return token
@register
class base64Binary(anyAtomicType):
    """
    Maps to `bytes`
    """
    name = 'base64Binary'
    example = 'YWJj'

    @staticmethod
    def to_python(v, **kw):
        """Decode the base64 string `v`; raise `ValueError` for non-ASCII or malformed input."""
        try:
            raw = to_binary(v, encoding='ascii')
        except UnicodeEncodeError:
            # Only the first characters of the offending value are reported.
            base64Binary.value_error(v[:10])
        try:
            return base64.decodebytes(raw)
        except Exception:
            raise ValueError('invalid base64 encoding')

    @staticmethod
    def to_string(v, **kw):
        """Encode `bytes` as base64 text without surrounding whitespace."""
        encoded = base64.encodebytes(v)
        return encoded.decode().strip()
@register
class _binary(base64Binary):
    """
    Maps to `bytes`.

    Registered under the name `binary` as alias for :class:`base64Binary`.
    """
    name = 'binary'
@register
class hexBinary(anyAtomicType):
    """
    Maps to `bytes`.

    .. note::

        We normalize to uppercase hex digits when serializing to `str`. Thus, roundtripping is
        limited.
    """
    name = 'hexBinary'
    example = 'ab'

    @staticmethod
    def to_python(v, **kw):
        """Decode the hex string `v`; raise `ValueError` for non-ASCII or malformed input."""
        try:
            raw = to_binary(v, encoding='ascii')
        except UnicodeEncodeError:
            # Only the first characters of the offending value are reported.
            hexBinary.value_error(v[:10])
        try:
            return binascii.unhexlify(raw)
        except (binascii.Error, TypeError):
            raise ValueError('invalid hexBinary encoding')

    @staticmethod
    def to_string(v, **kw):
        """Encode `bytes` as uppercase hex digits."""
        hex_digits = binascii.hexlify(v).decode()
        return hex_digits.upper()
@register
class boolean(anyAtomicType):
    """
    Maps to `bool`.

    .. code-block:: python

        >>> from csvw import Datatype
        >>> dt = Datatype.fromvalue({"base": "boolean", "format": "Yea|Nay"})
        >>> dt.read('Nay')
        False
        >>> dt.formatted(True)
        'Yea'

    .. seealso:: `<https://www.w3.org/TR/tabular-data-model/#formats-for-booleans>`_
    """
    name = 'boolean'
    example = 'false'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Determine the accepted literals for true and false.

        A string format of the form `"<true>|<false>"` selects custom literals. Any other
        non-empty format triggers a warning and the defaults are used.
        """
        fmt = datatype.format
        if fmt and isinstance(fmt, str) and fmt.count('|') == 1:
            yes, no = fmt.split('|')
            return {'true': [yes], 'false': [no]}
        if fmt:
            warnings.warn('Invalid format spec for boolean')
        return {'true': ['true', '1'], 'false': ['false', '0']}

    @staticmethod
    def to_python(s, true=('true', '1'), false=('false', '0')):
        """Map `s` to `bool`; `bool` and `None` are passed through, unknown literals raise."""
        if s is None or isinstance(s, bool):
            return s
        if s in true:
            return True
        if s in false:
            return False
        boolean.value_error(s)

    @staticmethod
    def to_string(v, true=('true', '1'), false=('false', '0')):
        """Return the first literal registered for the truth value of `v`."""
        literals = true if v else false
        return literals[0]
def with_tz(v, func, args, kw):
    """
    Parse `v` with `func`, handling a trailing timezone marker separately.

    A trailing `Z` or `+hh:mm`/`-hh:mm` offset is cut off before `func(v, *args, **kw)` is
    called. If a marker was found, the result is rebuilt as `datetime.datetime` carrying the
    corresponding `tzinfo`; otherwise the result of `func` is returned as is.
    """
    tz = None
    tz_match = re.search('(Z|[+-][0-2][0-9]:[0-5][0-9])$', v)
    if tz_match:
        tz = tz_match.group(1)
        v = v[:tz_match.start()]
    res = func(v, *args, **kw)
    if not tz:
        return res
    # The marker is appended to an arbitrary timestamp, just to let dateutil turn it into a
    # tzinfo object.
    dt = dateutil.parser.parse('{}{}'.format(datetime.datetime.now(), tz))
    return datetime.datetime(
        res.year, res.month, res.day, res.hour, res.minute, res.second, res.microsecond,
        dt.tzinfo)
@register
class dateTime(anyAtomicType):
    """
    Maps to `datetime.datetime`.
    """
    name = 'datetime'
    minmax = True
    example = '2018-12-10T20:20:20'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """Translate the `format` of `datatype` into format spec, regex and timezone marker."""
        return dt_format_and_regex(datatype.format)

    @staticmethod
    def _parse(v, cls, regex, tz_marker=None):
        """
        Parse `v` with `regex` and instantiate `cls` from the named groups that matched.

        :raises ValueError: if `v` doesn't match or has too many digits for fractions of seconds.
        """
        v = v.strip()
        try:
            parts = regex.match(v).groupdict()
        except AttributeError:  # pragma: no cover
            dateTime.value_error(v)
        if parts.get('extramicroseconds'):
            raise ValueError('Extra microseconds')
        fraction = parts.get('microsecond')
        if fraction:
            # Decimal fractions of seconds are converted to microseconds by first chopping off
            # anything beyond 6 decimal places, then (in case we got less precision)
            # right-padding with 0 to get a 6-digit number.
            parts['microsecond'] = fraction[:6].ljust(6, '0')
        if cls == datetime.datetime and 'year' not in parts:
            # A time-only value is anchored at the current date.
            today = datetime.date.today()
            parts.update(year=today.year, month=today.month, day=today.day)
        res = cls(**{key: int(val) for key, val in parts.items() if val is not None})
        if tz_marker:
            # Let dateutil take care of parsing the timezone info:
            res = res.replace(tzinfo=dateutil.parser.parse(v).tzinfo)
        return res

    @staticmethod
    def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Parse `v` as ISO 8601 datetime, falling back to the format specific `regex`."""
        if pattern and regex and not regex.match(v):
            raise ValueError('{} -- {} -- {}'.format(pattern, v, regex))  # pragma: no cover
        try:
            return dateutil.parser.isoparse(v)
        except ValueError:
            return dateTime._parse(v, datetime.datetime, regex, tz_marker=tz_marker)

    @staticmethod
    def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Format `v` according to `pattern` if given, as ISO 8601 string otherwise."""
        if not pattern:
            return v.isoformat()
        return babel.dates.format_datetime(v, tzinfo=v.tzinfo, format=pattern)
@register
class _dateTime(dateTime):
    """
    Maps to `datetime.datetime`.

    Registered under the name `dateTime` as alias for :class:`dateTime`.
    """
    name = 'dateTime'
@register
class date(dateTime):
    """
    Maps to `datetime.datetime` (in order to be able to preserve timezone information).
    """
    name = 'date'
    example = '2018-12-10'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """Like for `dateTime`, but with `yyyy-MM-dd` as default and fallback format."""
        default = 'yyyy-MM-dd'
        try:
            return dt_format_and_regex(datatype.format or default)
        except ValueError:
            warnings.warn('Invalid date format')
            return dt_format_and_regex(default)

    @staticmethod
    def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Parse the date `v`, honoring an optional trailing timezone marker."""
        parser_kw = {'regex': regex, 'fmt': fmt, 'pattern': pattern}
        return with_tz(v.strip(), dateTime.to_python, [], parser_kw)

    @staticmethod
    def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Format `v` as date according to `pattern`, or like a `dateTime` without pattern."""
        if not pattern:
            return dateTime.to_string(
                v, regex=regex, fmt=fmt, tz_marker=tz_marker, pattern=pattern)
        return babel.dates.format_date(v, format=pattern, locale='en')
@register
class dateTimeStamp(dateTime):
    """
    Maps to `datetime.datetime`.
    """
    name = 'dateTimeStamp'
    example = '2018-12-10T20:20:20'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Like for `dateTime`, but the format must include a timezone marker.

        :raises ValueError: if the format has no timezone marker.
        """
        fmt = datatype.format or 'yyyy-MM-ddTHH:mm:ss.SSSSSSXXX'
        description = dt_format_and_regex(fmt)
        if not description['tz_marker']:
            raise ValueError('dateTimeStamp must have timezone marker')
        return description
@register
class _time(dateTime):
    """
    Maps to `datetime.datetime` (in order to be able to preserve timezone information).
    """
    name = 'time'
    example = '20:20:20'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """Like for `dateTime`, but for time-only formats, with `HH:mm:ss` as default."""
        return dt_format_and_regex(datatype.format or 'HH:mm:ss', no_date=True)

    @staticmethod
    def to_python(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Parse the time `v` into a `datetime.datetime` anchored at the current date."""
        if pattern and 'x' in pattern.lower():
            # The pattern contains a timezone marker: prefix the current date and let dateutil
            # parse the whole thing.
            today = datetime.date.today().isoformat()
            return dateutil.parser.parse('{}T{}'.format(today, v))
        assert regex is not None
        return with_tz(v, dateTime._parse, [datetime.datetime, regex], {'tz_marker': tz_marker})

    @staticmethod
    def to_string(v, regex=None, fmt=None, tz_marker=None, pattern=None):
        """Format the time portion of `v` according to `pattern`."""
        return babel.dates.format_time(v, tzinfo=v.tzinfo, format=pattern)
@register
class duration(anyAtomicType):
    """
    Maps to `datetime.timedelta`.

    .. code-block:: python

        >>> from csvw import Datatype
        >>> dt = Datatype.fromvalue("datetime")
        >>> duration = Datatype.fromvalue("duration")
        >>> dt.formatted(dt.read("2022-06-24T12:00:00") + duration.read("P1MT2H"))
        '2022-07-24T14:00:00'
    """
    name = 'duration'
    example = 'P3Y6M4DT12H30M5S'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """Pass the `format` of `datatype` on as regex to validate values against."""
        return {'format': datatype.format}

    @staticmethod
    def to_python(v, format=None, **kw):
        """Parse the ISO 8601 duration `v`, after matching it against `format` if given."""
        if format:
            if not re.match(format, v):
                raise ValueError
        return isodate.parse_duration(v)

    @staticmethod
    def to_string(v, format=None, **kw):
        """Serialize `v` as ISO 8601 duration."""
        return isodate.duration_isoformat(v)
@register
class dayTimeDuration(duration):
    """
    Maps to `datetime.timedelta`.

    Behaves exactly like :class:`duration`.
    """
    name = 'dayTimeDuration'
@register
class yearMonthDuration(duration):
    """
    Maps to `datetime.timedelta`.

    Behaves exactly like :class:`duration`.
    """
    name = 'yearMonthDuration'
@register
class decimal(anyAtomicType):
    """
    Maps to `decimal.Decimal`.

    xs:decimal is the datatype that represents the set of all the decimal numbers with arbitrary
    lengths. Its lexical space allows any number of insignificant leading and trailing zeros
    (after the decimal point).

    There is no support for scientific notations.

    Valid values include: "123.456", "+1234.456", "-1234.456", "-.456", or "-456".

    The following values would be invalid: [...] "1234.456E+2" (scientific notation ("E+2") is
    forbidden).

    XML-Schema restricts the lexical space by disallowing "thousand separator" and forcing the
    decimal separator to be ".". But these limitations can be overcome within CSVW using a
    `derived datatype <https://www.w3.org/TR/tabular-data-model/#formats-for-numeric-types>`_:

    .. code-block:: python

        >>> from csvw import Datatype
        >>> dt = Datatype.fromvalue(
        ...     {"base": "decimal", "format": {"groupChar": ".", "decimalChar": ","}})
        >>> dt.read("1.234,5")
        Decimal('1234.5')

    .. note::

        While mapping to `decimal.Decimal` rather than `float` makes handling of the Python
        object somewhat cumbersome, it makes sure we can roundtrip values correctly.
    """
    name = 'decimal'
    minmax = True
    example = '5'

    # Lexical representations of special values, mapped to what `decimal.Decimal` understands.
    _special = {
        'INF': 'Infinity',
        '-INF': '-Infinity',
        'NaN': 'NaN',
    }
    _reverse_special = {v: k for k, v in _special.items()}

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Return the `format` of `datatype` as keyword arguments.

        A `dict` format is passed on as is, anything else is interpreted as number pattern.
        """
        if datatype.format:
            return datatype.format if isinstance(datatype.format, dict) \
                else {'pattern': datatype.format}
        return {}

    @staticmethod
    def to_python(v, pattern=None, decimalChar=None, groupChar=None):
        """
        Convert `v` to `decimal.Decimal`.

        :param pattern: Number format pattern `v` is validated against.
        :param decimalChar: Character used as decimal separator in `v`.
        :param groupChar: Character used as grouping separator in `v`.
        :raises ValueError: for scientific notation, repeated grouping separators, values not \
        matching `pattern` or otherwise unparseable values.
        """
        if isinstance(v, str) and 'e' in v.lower():
            raise ValueError('Invalid value for decimal')

        if isinstance(v, str) and re.search('{0}{0}+'.format(re.escape(groupChar or ',')), v):
            raise ValueError('Invalid value for decimal')

        # A pattern using the default separators implies these separators.
        if groupChar is None and pattern and ',' in pattern:
            groupChar = ','
        if decimalChar is None and pattern and '.' in pattern:
            decimalChar = '.'
        if pattern and not NumberPattern(pattern).is_valid(
                v.replace(groupChar or ',', ',').replace(decimalChar or '.', '.')):
            raise ValueError(
                'Invalid value "{}" for decimal with pattern "{}"'.format(v, pattern))

        factor = 1
        if isinstance(v, str):
            if v in decimal._special:
                warnings.warn('Invalid special value for decimal')
                return _decimal.Decimal(decimal._special[v])
            if groupChar:
                v = v.replace(groupChar, '')
            if decimalChar and decimalChar != '.':
                v = v.replace(decimalChar, '.')
            # Percent and per-mille signs scale the value.
            for c, factor in [
                ('%', _decimal.Decimal('0.01')),
                ('‰', _decimal.Decimal('0.001')),
            ]:
                if c in v:
                    v = v.replace(c, '')
                    break
            else:
                factor = 1
        try:
            return _decimal.Decimal(v) * factor
        except (TypeError, _decimal.InvalidOperation):
            decimal.value_error(v)

    @staticmethod
    def _expand_exponent(text, sign):
        """
        Rewrite the number `text`, given in scientific notation, in positional notation.

        The sign of the mantissa is ignored; `sign` is prepended to the result instead.
        """
        mantissa, _, exponent = text.lower().partition('e')
        int_part, _, frac_part = mantissa.replace('-', '').replace('+', '').partition('.')
        digits = int_part + frac_part
        # Position of the decimal point, counted from the left end of `digits`.
        point = len(int_part) + int(exponent)
        if point <= 0:
            body = '0.{}{}'.format('0' * -point, digits)
        elif point >= len(digits):
            body = '{}{}.0'.format(digits, '0' * (point - len(digits)))
        else:
            body = '{}.{}'.format(digits[:point], digits[point:])
        return sign + body

    @staticmethod
    def to_string(v, pattern=None, decimalChar=None, groupChar=None):
        """
        Serialize the number `v`.

        :param pattern: Number format pattern, passed to `babel.numbers.format_decimal`.
        :param decimalChar: Character to use as decimal separator instead of ".".
        :param groupChar: Character to use as grouping separator; enables digit grouping.
        """
        special = decimal._reverse_special.get('{}'.format(v))
        if special is not None:
            return special

        if pattern:
            v = babel.numbers.format_decimal(v, pattern, 'en')
            if decimalChar:
                v = v.replace('.', decimalChar)
            if groupChar:
                v = v.replace(',', groupChar)
            return v

        fmt = '{}' if groupChar is None else '{:,}'
        try:
            neg = v < 0
        except TypeError:
            neg = None
        v = fmt.format(v)
        if 'e' in v.lower():  # detect scientific notation
            # The number of zeros to pad depends on the length of the mantissa; padding a fixed
            # `abs(exp) - 1` zeros is only correct for two-digit mantissas.
            return decimal._expand_exponent(v, '-' if neg else '')

        if groupChar or decimalChar:
            # `v` was formatted with "," and "." as separators, so these are the characters to
            # look for; a separator which was not specified is kept as is.
            substitutes = {
                ',': ',' if groupChar is None else groupChar,
                '.': '.' if decimalChar is None else decimalChar,
            }
            v = re.sub('[,.]', lambda m: substitutes[m.group()], v)
        return v
@register
class integer(decimal):
    """
    Maps to `int`.

    Subclasses restrict the value space by setting `range` to a pair `(minimum, maximum)` of
    inclusive bounds.
    """
    name = 'integer'
    range = None

    @classmethod
    def to_python(cls, v, **kw):
        """
        Convert `v` to `int`, accepting the same lexical forms as :class:`decimal`.

        :raises ValueError: if `v` is not a valid decimal, is not integral (including the \
        special values INF, -INF and NaN) or lies outside of `cls.range`.
        """
        res = decimal.to_python(v, **kw)
        try:
            numerator, denominator = res.as_integer_ratio()
        except (OverflowError, ValueError) as err:
            # `as_integer_ratio` raises OverflowError for infinities and ValueError for NaN.
            raise ValueError('Invalid value for integer') from err
        if denominator != 1:
            raise ValueError('Invalid value for integer')
        if cls.range and not (cls.range[0] <= numerator <= cls.range[1]):
            raise ValueError("{} must be an integer between {} and {}, but got {}".format(
                cls.name, cls.range[0], cls.range[1], v))
        return numerator
@register
class _int(integer):
    """
    Maps to `int`.

    Registered under the name `int` as alias for :class:`integer`.
    """
    name = 'int'
@register
class unsignedInt(integer):
    """
    Maps to `int`.

    The value space of xs:unsignedInt is the integers between 0 and 4294967295, i.e., the
    unsigned values that can fit in a word of 32 bits. Its lexical space allows an optional “+”
    sign and leading zeros before the significant digits.

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values include "4294967295", "0", "+0000000000000000000005", or "1".

    Invalid values include "-1" and "1.".
    """
    name = 'unsignedInt'
    range = (0, 4294967295)
@register
class unsignedShort(integer):
    """
    Maps to `int`.

    The value space of xs:unsignedShort is the integers between 0 and 65535, i.e., the unsigned
    values that can fit in a word of 16 bits. Its lexical space allows an optional “+” sign and
    leading zeros before the significant digits.

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values include "65535", "0", "+0000000000000000000005", or "1".

    Invalid values include "-1" and "1.".
    """
    name = 'unsignedShort'
    range = (0, 65535)
@register
class unsignedLong(integer):
    """
    Maps to `int`.

    The value space of xs:unsignedLong is the integers between 0 and 18446744073709551615, i.e.,
    the unsigned values that can fit in a word of 64 bits. Its lexical space allows an optional
    “+” sign and leading zeros before the significant digits.

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values include "18446744073709551615", "0", "+0000000000000000000005", or "1".

    Invalid values include "-1" and "1.".
    """
    name = 'unsignedLong'
    range = (0, 18446744073709551615)
@register
class unsignedByte(integer):
    """
    Maps to `int`.

    The value space of xs:unsignedByte is the integers between 0 and 255, i.e., the unsigned
    values that can fit in a word of 8 bits. Its lexical space allows an optional “+” sign and
    leading zeros before the significant digits. The lexical space does not allow values
    expressed in other numeration bases (such as hexadecimal, octal, or binary).

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values include "255", "0", "+0000000000000000000005", or "1".

    Invalid values include "-1" and "1.".
    """
    name = 'unsignedByte'
    range = (0, 255)
@register
class short(integer):
    """
    Maps to `int`.

    The value space of xs:short is the set of common short integers (16 bits), i.e., the
    integers between -32768 and 32767; its lexical space allows any number of insignificant
    leading zeros.

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values include "-32768", "0", "-0000000000000000000005", or "32767".

    Invalid values include "32768" and "1.".
    """
    name = 'short'
    range = (-32768, 32767)
@register
class long(integer):
    """
    Maps to `int`.

    The value space of xs:long is the set of common double-size integers (64 bits), i.e., the
    integers between -9223372036854775808 and 9223372036854775807; its lexical space allows any
    number of insignificant leading zeros.

    The decimal point (even when followed only by insignificant zeros) is forbidden.

    Valid values for xs:long include "-9223372036854775808", "0", "-0000000000000000000005", or
    "9223372036854775807".

    Invalid values include "9223372036854775808" and "1.".
    """
    name = 'long'
    range = (-9223372036854775808, 9223372036854775807)
@register
class byte(integer):
    """
    Maps to `int`.

    The value space of xs:byte is the integers between -128 and 127, i.e., the signed values
    that can fit in a word of 8 bits. Its lexical space allows an optional sign and leading
    zeros before the significant digits. The lexical space does not allow values expressed in
    other numeration bases (such as hexadecimal, octal, or binary).

    Valid values for byte include 27, -34, +105, and 0.

    Invalid values include 0A, 1524, and INF.
    """
    name = 'byte'
    range = (-128, 127)
@register
class nonNegativeInteger(integer):
    """
    Maps to `int`.

    Accepts integers greater than or equal to 0.
    """
    name = 'nonNegativeInteger'
    range = (0, math.inf)
@register
class positiveInteger(integer):
    """
    Maps to `int`.

    Accepts integers greater than or equal to 1.
    """
    name = 'positiveInteger'
    range = (1, math.inf)
@register
class nonPositiveInteger(integer):
    """
    Maps to `int`.

    Accepts integers less than or equal to 0.
    """
    name = 'nonPositiveInteger'
    example = '-5'
    range = (-math.inf, 0)
@register
class negativeInteger(integer):
    """
    Maps to `int`.

    Accepts integers less than or equal to -1.
    """
    name = 'negativeInteger'
    example = '-5'
    range = (-math.inf, -1)
@register
class _float(anyAtomicType):
    """
    Maps to `float`.

    .. note::

        Due to the well known issues with representing floating point numbers, roundtripping
        may not work correctly.

    .. seealso:: `<https://docs.python.org/3/tutorial/floatingpoint.html>`_
    """
    name = 'float'
    minmax = True
    example = '5.3'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Return the `format` of `datatype` as keyword arguments.

        A `dict` format is passed on as is, anything else is interpreted as number pattern.
        """
        fmt = datatype.format
        if not fmt:
            return {}
        if isinstance(fmt, dict):
            return fmt
        return {'pattern': fmt}

    @staticmethod
    def to_python(v, pattern=None, **kw):
        """Convert `v` to `float`, after validating it against `pattern` if given."""
        if pattern:
            if not NumberPattern(pattern).is_valid(v):
                raise ValueError(
                    'Invalid value "{}" for number with pattern "{}"'.format(v, pattern))
        try:
            return float(v)
        except (TypeError, ValueError):
            _float.value_error(v)

    @staticmethod
    def to_string(v, **kw):
        """Serialize `v` using Python's default formatting."""
        return format(v)
@register
class number(_float):
    """
    Maps to `float`.

    Behaves exactly like the `float` datatype.
    """
    name = 'number'
@register
class double(_float):
    """
    Maps to `float`.

    Behaves exactly like the `float` datatype.
    """
    name = 'double'
@register
class normalizedString(string):
    """
    Maps to `str`.

    The lexical space of xs:normalizedString is unconstrained (any valid XML character may be
    used), and its value space is the set of strings after whitespace replacement (i.e., after
    any occurrence of #x9 (tab), #xA (linefeed), and #xD (carriage return) have been replaced by
    an occurrence of #x20 (space) without any whitespace collapsing).

    .. note::

        The CSVW test suite (specifically in `test036 <https://w3c.github.io/csvw/tests/#test036>`_
        and `test037 <https://w3c.github.io/csvw/tests/#test037>`_) requires that
        `normalizedString` is also trimmed, i.e. stripped from leading and trailing whitespace.
        So that's what we do.
    """
    name = 'normalizedString'

    @staticmethod
    def to_python(v, regex=None):
        """
        Normalize whitespace in `v` and validate the result like a `string`.

        :param regex: Compiled pattern (derived from the `format` property) the normalized \
        value must match.
        :raises ValueError: if `regex` is given and doesn't match the normalized value.
        """
        if v:
            for c in '\r\n\t':
                v = v.replace(c, ' ')
            v = v.strip()
        # The regex must be handed on, otherwise a `format` constraint is silently ignored.
        return string.to_python(v, regex=regex)
@register
class QName(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'QName'
@register
class gDay(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'gDay'
@register
class gMonth(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'gMonth'
@register
class gMonthDay(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'gMonthDay'
@register
class gYear(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'gYear'
@register
class gYearMonth(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'gYearMonth'
@register
class xml(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'xml'
@register
class html(string):
    """
    Maps to `str`.

    Validated like :class:`string`.
    """
    name = 'html'
@register
class json(string):
    """
    Maps to `str`, `list` or `dict`, i.e. to the result of `json.loads`.

    .. code-block:: python

        >>> from csvw import Datatype
        >>> dt = Datatype.fromvalue("json")
        >>> d = dt.read("{}")
        >>> d["a"] = '123'
        >>> dt.formatted(d)
        '{"a": "123"}'

    Additional constraints on JSON data can be imposed by specifying
    `JSON Schema <https://json-schema.org/>`_ documents as `format` annotation:

    .. code-block:: python

        >>> from csvw import Datatype
        >>> dt = Datatype.fromvalue({"base": "json", "format": '{"type": "object"}'})
        >>> dt.read('{}')
        OrderedDict()
        >>> dt.read('4')
        ...
        jsonschema.exceptions.ValidationError: 4 is not of type 'object'
        ...
        ValueError: invalid lexical value for json: 4

    .. note::

        To ensure proper roundtripping, we load the JSON strings using the
        `object_pairs_hook=collections.OrderedDict` keyword.
    """
    name = 'json'
    example = '{"a": [1,2]}'

    @staticmethod
    def derived_description(datatype: "csvw.Datatype") -> dict:
        """
        Interpret the `format` of `datatype` as JSON Schema.

        Returns `{'schema': <parsed schema>}` for a valid schema and `{}` otherwise. A format
        which is not JSON is ignored silently; an invalid schema triggers a warning.
        """
        if not datatype.format:
            return {}
        try:
            schema = _json.loads(datatype.format)
            try:
                # Validating an arbitrary instance is just a means to get the schema checked.
                jsonschema.validate({}, schema=schema)
            except jsonschema.ValidationError:
                # The schema is fine, `{}` merely doesn't conform to it.
                pass
            except jsonschema.SchemaError:
                warnings.warn('Invalid JSON schema as datatype format')
                return {}
            return {'schema': schema}
        except _json.JSONDecodeError:
            return {}

    # FIXME: ignored **kw?
    # why not just to_python = staticmethod(_json.loads)?
    @staticmethod
    def to_python(v, schema=None, **kw):
        """Parse the JSON string `v` and validate the result against `schema` if given."""
        parsed = _json.loads(v, object_pairs_hook=collections.OrderedDict)
        if not schema:
            return parsed
        try:
            jsonschema.validate(parsed, schema=schema)
        except jsonschema.ValidationError:
            json.value_error(v)
        return parsed

    @staticmethod
    def to_string(v, **kw):
        """Serialize `v` as JSON."""
        return _json.dumps(v)
# Date formats which must be recognized.
_DATE_PATTERNS = {
    "yyyy-MM-dd",  # e.g., 2015-03-22
    "yyyyMMdd",  # e.g., 20150322
    "dd-MM-yyyy",  # e.g., 22-03-2015
    "d-M-yyyy",  # e.g., 22-3-2015
    "MM-dd-yyyy",  # e.g., 03-22-2015
    "M-d-yyyy",  # e.g., 3-22-2015
    "dd/MM/yyyy",  # e.g., 22/03/2015
    "d/M/yyyy",  # e.g., 22/3/2015
    "MM/dd/yyyy",  # e.g., 03/22/2015
    "M/d/yyyy",  # e.g., 3/22/2015
    "dd.MM.yyyy",  # e.g., 22.03.2015
    "d.M.yyyy",  # e.g., 22.3.2015
    "MM.dd.yyyy",  # e.g., 03.22.2015
    "M.d.yyyy",  # e.g., 3.22.2015
}
# Time formats which must be recognized (apart from fractions of seconds).
_TIME_PATTERNS = {"HH:mm:ss", "HHmmss", "HH:mm", "HHmm"}
# dateTime component markers, mapped to the corresponding format specs and regular expressions
# used for formatting and parsing.
_DT_COMPONENTS = {
    'yyyy': ('{dt.year:04d}', '(?P<year>[0-9]{4})'),
    'MM': ('{dt.month:02d}', '(?P<month>[0-9]{2})'),
    'dd': ('{dt.day:02d}', '(?P<day>[0-9]{2})'),
    'M': ('{dt.month}', '(?P<month>[0-9]{1,2})'),
    'd': ('{dt.day}', '(?P<day>[0-9]{1,2})'),
    'HH': ('{dt.hour:02d}', '(?P<hour>[0-9]{2})'),
    'mm': ('{dt.minute:02d}', '(?P<minute>[0-9]{2})'),
    'ss': ('{dt.second:02d}', '(?P<second>[0-9]{2})'),
}


def _translate_dt_components(spec, sep):
    """
    Convert a bare date or time format into a pair `(format spec, regex)`.

    Components are delimited by `sep`, or - if `sep` is `None` - by a change of the marker
    character (as in `yyyyMMdd`).
    """
    if sep:
        parts = spec.split(sep)
    else:
        parts = [''.join(chars) for _, chars in itertools.groupby(spec)]
        sep = ''
    return (
        sep.join(_DT_COMPONENTS[part][0] for part in parts),
        re.escape(sep).join(_DT_COMPONENTS[part][1] for part in parts))


def dt_format_and_regex(fmt, no_date=False):
    """
    Translate a date/time format into the data needed for parsing and formatting.

    :param fmt: Format string, `dict` with sole key `"pattern"` or `None`.
    :param no_date: Whether a format without date/time separator is a time format.
    :return: `dict` with keys `regex` (compiled pattern with named groups for the components), \
    `fmt` (`str.format` template), `tz_marker` (trailing timezone marker or `None`) and \
    `pattern` (the format as passed in); all values are `None` if `fmt` is `None`.
    :raises ValueError: if the format is not one of the recognized date/time formats.

    .. seealso:: http://w3c.github.io/csvw/syntax/#formats-for-dates-and-times
    """
    if fmt is None:
        return {'fmt': None, 'tz_marker': None, 'regex': None, 'pattern': None}

    if isinstance(fmt, dict) and list(fmt.keys()) == ['pattern']:
        fmt = fmt['pattern']
    pattern = fmt

    # First, we strip off an optional timezone marker:
    tz_marker = None
    match = re.search('(?P<marker> ?[xX]{1,3})$', fmt)
    if match:
        tz_marker = match.group('marker')
        if len(set(tz_marker.strip())) != 1:  # mixing x and X is not allowed!
            raise ValueError(fmt)
        fmt = fmt[:match.start()]

    # Only a single space or "T" may separate date and time format. Since space or "T" isn't
    # allowed anywhere else in the format, checking whether we are dealing with a date or
    # dateTime format is simple:
    dt_sep = next((sep for sep in ' T' if sep in fmt), None)
    if dt_sep:
        dfmt, tfmt = fmt.split(dt_sep)
    elif no_date:
        dfmt, tfmt = None, fmt
    else:
        dfmt, tfmt = fmt, None

    msecs = None  # The maximal number of decimal places for fractions of seconds.
    if tfmt and '.' in tfmt:  # There is a microseconds marker.
        tfmt, msecs = tfmt.split('.')  # Strip it off ...
        if set(msecs) != {'S'}:  # ... make sure it's valid ...
            raise ValueError(fmt)
        msecs = len(msecs)  # ... and store its length.

    # Now we can check whether the bare date and time formats are valid:
    if (dfmt and dfmt not in _DATE_PATTERNS) or (tfmt and tfmt not in _TIME_PATTERNS):
        raise ValueError(fmt)

    regex, template = '', ''  # Initialize the output.

    if dfmt:
        # Determine the separator used for date components.
        d_sep = next((sep for sep in '.-/' if sep in dfmt), None)
        f, r = _translate_dt_components(dfmt, d_sep)
        template += f
        regex += r

    if dt_sep:
        template += dt_sep
        regex += re.escape(dt_sep)

    if tfmt:
        # For time components the only valid separator is ":".
        f, r = _translate_dt_components(tfmt, ':' if ':' in tfmt else None)
        template += f
        regex += r

    # Fractions of seconds are a bit of a problem, because datetime objects only offer
    # microseconds.
    if msecs:
        template += '.{microsecond:.%s}' % msecs
        regex += r'(\.(?P<microsecond>[0-9]{1,%s})(?![0-9]))?' % msecs
        regex += r'(\.(?P<extramicroseconds>[0-9]{%s,})(?![0-9]))?' % (msecs + 1,)

    return {
        'regex': re.compile(regex), 'fmt': template, 'tz_marker': tz_marker, 'pattern': pattern}


class NumberPattern:
    """
    Implementations MUST recognise number format patterns containing the symbols 0, #, the
    specified decimalChar (or "." if unspecified), the specified groupChar (or "," if
    unspecified), E, +, % and ‰.

    The number of # placeholder characters before the decimal do not matter, since no limit is
    placed on the maximum number of digits. There should, however, be at least one zero
    someplace in the pattern.

    Patterns are expected to use "," as grouping and "." as decimal separator.
    """
    def __init__(self, pattern):
        """
        :param pattern: Number format pattern, optionally consisting of a positive and a \
        negative sub-pattern, separated by ";".
        """
        assert pattern.count(';') <= 1
        self.positive, _, self.negative = pattern.partition(';')
        if not self.negative:
            self.negative = '-' + self.positive.replace('+', '')

    @property
    def primary_grouping_size(self):
        """Size of the last group of integral digits; `None` if the pattern has no groups."""
        comps = self.positive.split('.')[0].split(',')
        if len(comps) > 1:
            return comps[-1].count('#') + comps[-1].count('0')

    @property
    def secondary_grouping_size(self):
        """Size of the other groups of integral digits, defaulting to the primary size."""
        comps = self.positive.split('.')[0].split(',')
        if len(comps) > 2:
            return comps[1].count('#') + comps[1].count('0')
        return self.primary_grouping_size

    @property
    def min_digits_before_decimal_point(self):
        """Number of zeros at the end of the integral part of the pattern, `None` if none."""
        integral_part = self.positive.split('.')[0]
        match = re.search('([0]+)$', integral_part)
        if match:
            return len(match.groups()[0])

    @property
    def exponent_digits(self):
        """Number of digit placeholders at the start of the exponent part of the pattern."""
        _, _, exponent = self.positive.lower().partition('e')
        i = 0
        for c in exponent:
            if c in '0#':
                i += 1
            elif c in ',':
                continue
            else:
                break
        return i

    @property
    def decimal_digits(self):
        """Number of digit placeholders after the decimal point, up to an "E"."""
        i = 0
        _, _, decimal_part = self.positive.partition('.')
        for c in decimal_part:
            if c in '#0':
                i += 1
            if c == 'E':
                break
        return i

    @property
    def significant_decimal_digits(self):
        """Number of zeros after the decimal point, up to the first "#" or "E"."""
        i = 0
        _, _, decimal_part = self.positive.partition('.')
        for c in decimal_part:
            if c == '0':
                i += 1
            if c in ['E', '#']:
                break
        return i

    def is_valid(self, s):
        """
        Check whether the number `s` - written with "," as grouping and "." as decimal
        separator - conforms to the pattern.
        """
        def digits(ss):
            return [c for c in ss if c not in '.,E+-%‰']

        integral_part, _, decimal_part = s.partition('.')
        decimal_part, _, exponent = decimal_part.lower().partition('e')
        groups = integral_part.split(',')

        # Collect the integral digits, ignoring signs, percent and per-mille markers as well as
        # leading zeros.
        significant, leadingzero, skip = [], False, True
        for c in ''.join(groups):
            if c in ['+', '-', '%', '‰']:
                continue
            if c == '0' and skip:
                leadingzero = True
                continue
            if c != '0':
                skip = False
            significant.append(c)
        if not significant and leadingzero:
            significant = ['0']

        min_digits = self.min_digits_before_decimal_point
        if min_digits and len(significant) < min_digits:
            return False

        primary = self.primary_grouping_size
        if primary and groups:
            last_group = len(digits(groups[-1]))
            if last_group > primary:
                return False
            if len(groups) > 1 and last_group < primary:
                return False

        secondary = self.secondary_grouping_size
        if secondary and len(groups) > 1:
            for i, group in enumerate(groups[:-1]):
                # Only the first group may be shorter than the grouping size.
                if i == 0:
                    if len(digits(group)) > secondary:
                        return False
                else:
                    if len(digits(group)) != secondary:
                        return False

        if decimal_part:
            if len(digits(decimal_part)) > self.decimal_digits:
                return False

        required_decimals = self.significant_decimal_digits
        if required_decimals:
            if (not decimal_part) or (len(digits(decimal_part)) < required_decimals):
                return False

        exponent_digits = self.exponent_digits
        if exponent_digits and 'e' in s.lower():
            if len(digits(s.lower().split('e')[-1])) > exponent_digits:
                return False

        return True