# Current File : /home/jvzmxxx/wiki1/extensions/EventLogging/server/eventlogging/jrm.py
# -*- coding: utf-8 -*-
"""
  eventlogging.jrm
  ~~~~~~~~~~~~~~~~

  This module provides a simple object-relational mapper for JSON
  schemas and the objects they describe (hence 'jrm').

"""
from __future__ import division, unicode_literals

import collections
import datetime
import itertools
import logging
import _mysql
import sqlalchemy

from .compat import items
from .schema import get_schema
from .utils import flatten


__all__ = ('store_sql_events',)


# Format string for :func:`datetime.datetime.strptime` for MediaWiki
# timestamps. See `<https://www.mediawiki.org/wiki/Manual:Timestamp>`_.
MEDIAWIKI_TIMESTAMP = '%Y%m%d%H%M%S'

# Format string for table names. Interpolates a `SCID` -- i.e., a tuple
# of (schema_name, revision_id).
TABLE_NAME_FORMAT = '%s_%s'

# An iterable of properties that should not be stored in the database.
# (These are stripped by :func:`prepare` and excluded from column
# generation in :func:`schema_mapper`.)
NO_DB_PROPERTIES = ('recvFrom', 'revision', 'schema', 'seqId')

# A dictionary mapping database engine names to table defaults, passed
# as keyword arguments to :class:`sqlalchemy.Table` in `declare_table`.
ENGINE_TABLE_OPTIONS = {
    'mysql': {
        'mysql_charset': 'utf8',
        'mysql_engine': 'InnoDB'
    }
}

# How long (in seconds) we should accumulate events before flushing
# to the database. NOTE(review): not referenced within this module;
# presumably consumed by the writer that calls `store_sql_events`.
DB_FLUSH_INTERVAL = 2


class MediaWikiTimestamp(sqlalchemy.TypeDecorator):
    """Custom column type that persists timestamps in MediaWiki's
    14-character ``YYYYMMDDHHMMSS`` format."""

    # Underlying storage is a VARCHAR(14) column.
    impl = sqlalchemy.Unicode(14)

    def process_bind_param(self, value, dialect=None):
        """Render an integer UNIX epoch offset (seconds or milliseconds)
        as a MediaWiki timestamp string suitable for storage."""
        # Heuristic: offsets above 1e12 must be in milliseconds, so
        # scale them down to seconds before conversion.
        if value > 1e12:
            value /= 1000
        stamp = datetime.datetime.utcfromtimestamp(value).strftime(
            MEDIAWIKI_TIMESTAMP)
        # On Python 2, strftime yields bytes; normalize to unicode.
        if hasattr(stamp, 'decode'):
            stamp = stamp.decode('utf-8')
        return stamp

    def process_result_value(self, value, dialect=None):
        """Parse a stored MediaWiki timestamp back into a
        :class:`datetime.datetime` object."""
        return datetime.datetime.strptime(value, MEDIAWIKI_TIMESTAMP)


# Maximum length for string and string-like types. Because InnoDB limits index
# columns to 767 bytes, the maximum length for a utf8mb4 column (which
# reserves up to four bytes per character) is 191 (191 * 4 = 764).
STRING_MAX_LEN = 191

# Default table column definition, to be overridden by mappers below.
COLUMN_DEFAULTS = {'type_': sqlalchemy.Unicode(STRING_MAX_LEN)}

# Mapping of JSON Schema attributes to valid values. Each value maps to
# a dictionary of options. The options are compounded into a single
# dict, which is then used as kwargs for :class:`sqlalchemy.Column`.
# Consumed by :func:`typecast`.
#
# ..note::
#
#   The mapping is keyed in order of increasing specificity. Thus a
#   JSON property {"type": "number", "format": "utc-millisec"} will
#   map onto a :class:`MediaWikiTimestamp` type, and not
#   :class:`sqlalchemy.Float`, because 'format' is processed after
#   'type' and overwrites its 'type_' option.
mappers = collections.OrderedDict((
    ('type', {
        'boolean': {'type_': sqlalchemy.Boolean},
        'integer': {'type_': sqlalchemy.BigInteger},
        'number': {'type_': sqlalchemy.Float},
        'string': {'type_': sqlalchemy.Unicode(STRING_MAX_LEN)},
    }),
    ('format', {
        'utc-millisec': {'type_': MediaWikiTimestamp, 'index': True},
        'uuid5-hex': {'type_': sqlalchemy.CHAR(32), 'index': True,
                      'unique': True},
    }),
    ('required', {
        True: {'nullable': False},
        False: {'nullable': True}
    })
))


def typecast(property):
    """Generates a SQL column definition from a JSON Schema property
    specifier."""
    # Start from the defaults, then let each mapper (in order of
    # increasing specificity) override matching options.
    options = dict(COLUMN_DEFAULTS)
    for attribute, mapping in items(mappers):
        options.update(mapping.get(property.get(attribute), ()))
    return sqlalchemy.Column(**options)


def get_table(meta, scid):
    """Acquire a :class:`sqlalchemy.schema.Table` object for a JSON
    Schema specified by `scid`.

    If the table is already described in `meta`, it is assumed to exist
    in the database and the cached description is returned as-is.
    Otherwise the table is described from the schema (and created in
    the database if absent) via :func:`declare_table`.
    """
    name = TABLE_NAME_FORMAT % scid
    try:
        return meta.tables[name]
    except KeyError:
        return declare_table(meta, scid)


def declare_table(meta, scid):
    """Map a JSON schema to a SQL table. If the table does not exist in
    the database, issue ``CREATE TABLE`` statement."""
    schema = get_schema(scid, encapsulate=True)
    columns = schema_mapper(schema)

    # Engine-specific table defaults (e.g. charset/engine for MySQL).
    options = ENGINE_TABLE_OPTIONS.get(meta.bind.name, {})

    table = sqlalchemy.Table(TABLE_NAME_FORMAT % scid, meta,
                             *columns, **options)
    # `checkfirst` makes this a no-op if the table already exists.
    table.create(checkfirst=True)
    return table


def _insert_sequential(table, events, replace=False):
    """Insert events into the database by issuing an INSERT for each one.

    :param table: :class:`sqlalchemy.schema.Table` to insert into.
    :param events: iterable of prepared (flattened) event dicts.
    :param replace: if True, skip duplicates on MySQL (``INSERT
        IGNORE``) or replace them on SQLite (``INSERT OR REPLACE``).
    """
    for event in events:
        insert = table.insert(values=event)
        if replace:
            insert = (insert
                      .prefix_with('IGNORE', dialect='mysql')
                      .prefix_with('OR REPLACE', dialect='sqlite'))
        try:
            insert.execute()
        except sqlalchemy.exc.IntegrityError as e:
            # A MySQL duplicate-key error (errno 1062) means this event
            # is already stored: log it and continue with the next
            # event. Any other integrity violation is a real problem,
            # so re-raise instead of silently swallowing it.
            # (`isinstance` and `.args[0]` instead of `type() ==` and
            # `e.orig[0]`: exceptions are not indexable on Python 3.)
            if (isinstance(e.orig, _mysql.IntegrityError) and
                    e.orig.args[0] == 1062):
                logging.error(e)
            else:
                raise
        except sqlalchemy.exc.ProgrammingError:
            # The table presumably does not exist yet; create it and
            # retry the insert once.
            table.create()
            insert.execute()


def _insert_multi(table, events, replace=False):
    """Insert events into the database using a single multi-row INSERT.

    :param table: :class:`sqlalchemy.schema.Table` to insert into.
    :param events: list of prepared (flattened) event dicts, all with
        the same set of fields.
    :param replace: if True, skip duplicates on MySQL (``INSERT
        IGNORE``) or replace them on SQLite (``INSERT OR REPLACE``).
    """
    insert = table.insert(values=events)
    if replace:
        insert = (insert
                  .prefix_with('IGNORE', dialect='mysql')
                  .prefix_with('OR REPLACE', dialect='sqlite'))
    try:
        insert.execute()
    except sqlalchemy.exc.IntegrityError as e:
        # (`isinstance` and `.args[0]` instead of `type() ==` and
        # `e.orig[0]`: exceptions are not indexable on Python 3.)
        if (isinstance(e.orig, _mysql.IntegrityError) and
                e.orig.args[0] == 1062):
            # A MySQL duplicate-key error aborts the whole multi-row
            # statement, so none of the events in this batch were
            # stored. Log the error, then retry one-by-one so every
            # non-duplicate event still gets a chance to be inserted.
            logging.error(e)
            _insert_sequential(table, events, replace=replace)
        else:
            # Any other integrity violation is a real problem.
            raise
    except sqlalchemy.exc.SQLAlchemyError:
        # Most likely the table does not exist yet; create it (no-op if
        # present) and retry the insert once.
        table.create(checkfirst=True)
        insert.execute()


def insert_sort_key(event):
    """Return a canonical, hashable signature of the event's field
    names, used to sort/group events that share the same set of fields.
    Schemas allow optional fields, so two events of the same schema may
    carry different key sets."""
    keys = list(event)
    keys.sort()
    return tuple(keys)


def store_sql_events(meta, events_batch, replace=False,
                     on_insert_callback=None):
    """Store events in the database.

    It assumes that the events come broken down by scid.

    :param meta: bound :class:`sqlalchemy.schema.MetaData` instance.
    :param events_batch: list of ``(scid, events)`` pairs; consumed
        destructively via ``pop()``.
    :param replace: passed through to the insert helpers to skip or
        replace duplicate rows.
    :param on_insert_callback: optional callable invoked with the
        number of events inserted after each successful INSERT.
    """
    logger = logging.getLogger('Log')

    # Multi-row INSERTs are much faster, but not every dialect supports
    # them; fall back to row-at-a-time insertion otherwise. (The flag
    # was renamed across sqlalchemy versions, so check both spellings.)
    dialect = meta.bind.dialect
    if (getattr(dialect, 'supports_multivalues_insert', False) or
            getattr(dialect, 'supports_multirow_insert', False)):
        insert = _insert_multi
    else:
        insert = _insert_sequential

    while events_batch:
        scid, scid_events = events_batch.pop()
        # The table is the same for every event with this scid; look it
        # up once rather than once per field-set group.
        table = get_table(meta, scid)
        prepared_events = [prepare(e) for e in scid_events]
        # TODO: Avoid breaking the inserts down by same set of fields,
        # instead force a default NULL, 0 or '' value for optional fields.
        prepared_events.sort(key=insert_sort_key)
        for _, grouper in itertools.groupby(prepared_events, insert_sort_key):
            events = list(grouper)
            insert(table, events, replace)
            # The insert operation is all or nothing - either all events have
            # been inserted successfully (sqlalchemy wraps the insertion in a
            # transaction), or an exception is thrown and it's not caught
            # anywhere. This means that if the following line is reached,
            # len(events) events have been inserted, so we can log it.
            logger.info('Data inserted %d', len(events))
            if on_insert_callback:
                on_insert_callback(len(events))


def _property_getter(item):
    """Mapper function for :func:`flatten` that extracts properties
    and their types from schema."""
    key, val = item
    if isinstance(val, dict):
        if 'properties' in val:
            val = val['properties']
        elif 'type' in val:
            val = typecast(val)
    return key, val


def prepare(event):
    """Prepare an event for insertion into the database: flatten nested
    properties and strip fields that are never stored."""
    prepared = flatten(event)
    for unwanted in NO_DB_PROPERTIES:
        prepared.pop(unwanted, None)
    return prepared


def column_sort_key(column):
    """Sort key for column names. 'id' and 'uuid' come first, then the
    top-level properties in alphabetical order, followed by the nested
    properties (identifiable by the presence of an underscore)."""
    name = column.name
    # Any name other than 'id'/'uuid' indexes as itself, i.e. slot 2,
    # so all ordinary columns tie on the first component.
    precedence = ('id', 'uuid', name).index(name)
    return (precedence, name.count('_'), name)


def schema_mapper(schema):
    """Takes a schema and map its properties to database column
    definitions, sorted into their canonical column order."""
    # Drop properties that are never persisted before mapping.
    properties = {}
    for key, value in items(schema.get('properties', {})):
        if key not in NO_DB_PROPERTIES:
            properties[key] = value

    columns = []
    for name, column in items(flatten(properties, f=_property_getter)):
        column.name = name
        columns.append(column)

    return sorted(columns, key=column_sort_key)