Source code for ternip.rule_engine.normalisation_rule

#!/usr/bin/env python

import calendar
import logging
import re

from ternip.rule_engine import rule
from ternip.rule_engine.expressions import *
from ternip.rule_engine.normalisation_functions.date_functions import *
from ternip.rule_engine.normalisation_functions.relative_date_functions import *
from ternip.rule_engine.normalisation_functions.string_conversions import *
from ternip.rule_engine.normalisation_functions.words_to_num import *


LOGGER = logging.getLogger(__name__)


class NormalisationRule(rule.Rule):
    """ A class that represents normalisation rules """

    # If debug mode is enabled, then the comment in the TIMEX tag is set to
    # the ID of the rule which normalised it
    _DEBUG = False

    def __init__(self, match,
                 type=None,
                 id='',
                 value=None,
                 change_type=None,
                 freq=None,
                 quant=None,
                 mod=None,
                 guards=None,
                 after_guards=None,
                 before_guards=None,
                 sent_guards=None,
                 after=None,
                 tokenise=True,
                 deliminate_numbers=False):
        """
        Create a normalisation rule, with a number of optional arguments. If
        tokenise is set to True, then regexes are in the form to be used with
        nltk.TokenSearcher.findall
        (http://nltk.googlecode.com/svn/trunk/doc/api/nltk.text.TokenSearcher-class.html#findall),
        with the amendment that the bodies of the tokens are actually in the
        form <token~POS>, e.g., <about~.+> would match 'about' with any POS
        tag.

        match is a regex which the body of the timex must match for this
        rule to run. Subgroups of this expression are available to later
        expressions.

        type is the type of TIMEX to which this rule applies.

        id is a unique string other rules can refer to in order to express
        an ordering.

        value is a Python expression which returns a value (in ISO 8601
        format, as modified in TimeML). Subgroups from the match expression
        are available in the form {#[0-9]+}.

        change_type, freq, quant and mod are Python expressions, like value,
        which set the corresponding TIMEX attributes.

        guards is a list of regexes which must be satisfied for this rule to
        be applied. Defaults to an empty list. If the first character in a
        regex is a !, it is a negative guard: the guard must NOT match for
        this rule to be applied.

        after_guards are like guards, but match against the text following
        the annotation in the sentence.

        before_guards are like after_guards, but match against the preceding
        text.

        sent_guards are like guards, but match against the entire sentence.

        after is a list of IDs of rules which must have been executed before
        this rule can run.

        tokenise is whether or not the regular expressions to be matched
        against care about token boundaries/POS tags. If it is set to a
        string instead of True, that string is treated as the token
        separator.
        """

        if not after:
            after = []
        if not sent_guards:
            sent_guards = []
        if not before_guards:
            before_guards = []
        if not after_guards:
            after_guards = []
        if not guards:
            guards = []

        self.id = id
        self._type = type
        self._match = re.compile(self._prep_re(match, tokenise), re.IGNORECASE)
        self.after = after
        self._tokenise = tokenise
        self._deliminate_numbers = deliminate_numbers

        self._value_exp = self._compile_exp(value, 'value')
        self._type_exp = self._compile_exp(change_type, 'change-type')
        self._freq_exp = self._compile_exp(freq, 'freq')
        self._quant_exp = self._compile_exp(quant, 'quant')
        self._mod_exp = self._compile_exp(mod, 'mod')

        # Load guards
        self._guards = self._load_guards(guards, tokenise)
        self._before_guards = self._load_guards(before_guards, tokenise)
        self._after_guards = self._load_guards(after_guards, tokenise)
        self._sent_guards = self._load_guards(sent_guards, tokenise)

    def _compile_exp(self, exp, type):
        """
        Replace our group short form in value expressions, e.g., {#6}, with
        actual Python code so that matched regular expressions get subbed in
        """
        # it would be nice to support named groups, but this'll do for now
        if exp is not None:
            return compile(re.sub(r'\{#(\d+)\}', r'match.group(\1)', exp),
                           self.id + ':' + type, 'eval')
        else:
            return None
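
    # Illustration (not part of the original source): given the hypothetical
    # value expression '{#1} + "-" + {#2}', _compile_exp above produces a
    # code object equivalent to compiling
    # 'match.group(1) + "-" + match.group(2)', which apply() below evaluates
    # with the regex match object in scope.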

    def apply(self, timex, cur_context, dct, body, before, after):
        """
        Applies this rule to this timex, where body is the full extent
        covered by this timex, before is the preceding text in the sentence,
        and after is the following text in the sentence, in the
        [(token, POS, timexes), ...] form.

        A boolean indicating whether or not application was successful is
        returned. The timex may also be modified, so should be passed in by
        reference.
        """

        # Check this rule type matches the timex type
        if self._type is not None and timex.type.lower() != self._type.lower():
            return False, cur_context

        # Check before, after and whole sentence guards
        if not self._check_guards(self._toks_to_str(before), self._before_guards):
            return False, cur_context

        if not self._check_guards(self._toks_to_str(after), self._after_guards):
            return False, cur_context

        if not self._check_guards(self._toks_to_str(body), self._guards):
            return False, cur_context

        if not self._check_guards(self._toks_to_str(before + body + after), self._sent_guards):
            return False, cur_context

        # Now, check if we match:
        if self._tokenise is True:
            senttext = self._toks_to_str(body)
            if self._deliminate_numbers:
                senttext = self._do_deliminate_numbers(senttext)
        else:
            # tokenise holds the token separator when it is not True
            senttext = self._tokenise.join([tok for (tok, pos, ts) in body])

        match = self._match.search(senttext)

        # If we do, then calculate attributes for the timex
        if match:
            if self._DEBUG:
                timex.comment = self.id

            try:
                if self._value_exp is not None:
                    timex.value = eval(self._value_exp)

                if self._type_exp is not None:
                    timex.type = eval(self._type_exp)

                if self._freq_exp is not None:
                    timex.freq = eval(self._freq_exp)

                if self._quant_exp is not None:
                    timex.quant = eval(self._quant_exp)

                if self._mod_exp is not None:
                    timex.mod = eval(self._mod_exp)
            except Exception:
                LOGGER.exception('Malformed rule expression')

            # Need to update current time context, if necessary
            return True, cur_context
        else:
            # Rule did not match
            return False, cur_context
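

# A minimal, self-contained sketch (not part of the original source) of the
# value-expression mechanism used above: the {#n} short form is rewritten to
# match.group(n), compiled, and later evaluated with the regex match object
# in scope, exactly as _compile_exp and apply() do. The expression and regex
# below are hypothetical, not taken from any shipped rule file.
if __name__ == '__main__':
    value_exp = '{#1} + "-" + {#2}'  # hypothetical rule value expression
    code = compile(re.sub(r'\{#(\d+)\}', r'match.group(\1)', value_exp),
                   'example:value', 'eval')

    match = re.search(r'(\d{4})\D+(\d{2})', 'in 1996, June (06)')
    print(eval(code))  # prints: 1996-06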