# ternip.rule_engine.normalisation_rule_engine

#!/usr/bin/env python

import re
from ternip.rule_engine.normalisation_rule import NormalisationRule
from ternip.rule_engine.normalisation_rule_block import NormalisationRuleBlock
from ternip.rule_engine.rule_engine import RuleEngine, RuleLoadError

class NormalisationRuleEngine(RuleEngine):
    """
    A class which does normalisation using a rule engine
    
    Complex rules must have a string member called 'id', which is used for
    after ordering, and a list of strings called 'after' (which can be an
    empty list) consisting of the IDs of rules that must have run before
    this rule runs. Additionally, a function called 'apply' which takes the
    arguments (timex, cur_context, dct, body, before, after) and returns a
    (success, new_context) tuple, potentially having modified the timex;
    this is the signature invoked by 'annotate' below.
    """
    
    _block_type = NormalisationRuleBlock
    
    def _load_rule(self, filename, rulelines):
        """
        Load a 'simple' normalisation rule
        """
        
        # get key/value dictionaries
        d = self._parse_rule(filename, rulelines)
        
        # Set defaults
        type = None
        match = None
        id = filename
        value = None
        guards = []
        before_guards = []
        after_guards = []
        sent_guards = []
        after = []
        tokenise = True
        deliminate_numbers = False
        change_type = None
        freq = None
        quant = None
        mod = None
        
        for key in d:
            
            # Only one 'Type' field allowed
            if key == 'type':
                if len(d[key]) != 1:
                    raise RuleLoadError(filename, "Too many 'Type' fields")
                else:
                    type = d[key][0]
            
            # Exactly one 'Match' field required
            elif key == 'match':
                if len(d[key]) != 1:
                    raise RuleLoadError(filename, "There must be exactly 1 'Match' field")
                else:
                    match = d[key][0]
            
            # No more than one 'ID' field allowed
            elif key == 'id':
                if len(d[key]) == 1:
                    id = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'ID' fields")
            
            # No more than one 'Value' field allowed
            elif key == 'value':
                if len(d[key]) == 1:
                    value = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Value' fields")
            
            # No more than one 'Change-Type' field allowed
            elif key == 'change-type':
                if len(d[key]) == 1:
                    change_type = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Change-Type' fields")
            
            # No more than one 'Freq' field allowed
            elif key == 'freq':
                if len(d[key]) == 1:
                    freq = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Freq' fields")
            
            # No more than one 'Quant' field allowed
            elif key == 'quant':
                if len(d[key]) == 1:
                    quant = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Quant' fields")
            
            # No more than one 'Mod' field allowed
            elif key == 'mod':
                if len(d[key]) == 1:
                    mod = d[key][0]
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Mod' fields")
            
            # set optional fields
            elif key == 'guard':
                guards = d[key]
            elif key == 'after':
                after = d[key]
            elif key == 'before-guard':
                before_guards = d[key]
            elif key == 'after-guard':
                after_guards = d[key]
            elif key == 'sent-guard':
                sent_guards = d[key]
            
            # Tokenise is an optional field which may take the values
            # 'true' (the default), 'space' or 'null'
            elif key == 'tokenise':
                if len(d[key]) == 1:
                    tokenise = d[key][0].lower()
                    if tokenise == 'true':
                        tokenise = True
                    elif tokenise == 'space':
                        tokenise = ' '
                    elif tokenise == 'null':
                        tokenise = ''
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Tokenise' fields")
            
            # Deliminate-Numbers is an optional field, defaulting to False,
            # which accepts either true or false (case-insensitive) as values
            elif key == 'deliminate-numbers':
                if len(d[key]) == 1:
                    deliminate_numbers = d[key][0].lower()
                    if deliminate_numbers == 'true':
                        deliminate_numbers = True
                    elif deliminate_numbers == 'false':
                        deliminate_numbers = False
                    else:
                        raise RuleLoadError(filename, "Deliminate-Numbers must be either 'True' or 'False'")
                elif len(d[key]) > 1:
                    raise RuleLoadError(filename, "Too many 'Deliminate-Numbers' fields")
            
            # error on unknown fields
            else:
                raise RuleLoadError(filename, "Unknown field '" + key + "'")
        
        if match is None:
            raise RuleLoadError(filename, "'Match' is a compulsory field")
        
        if deliminate_numbers and not tokenise:
            raise RuleLoadError(filename, "'Deliminate-Numbers' can not be set if 'Tokenise' is null")
        
        # Guard against any RE errors
        try:
            return NormalisationRule(match, type, id, value, change_type,
                freq, quant, mod, guards, after_guards, before_guards,
                sent_guards, after, tokenise, deliminate_numbers)
        except re.error as e:
            raise RuleLoadError(filename, "Malformed regular expression: " + str(e))
    def annotate(self, sents, dct):
        """
        This annotates all the timexes in the sents. dct means the document
        creation time (in the TIDES-modified ISO8601 format), which some
        rules may use to determine a context.
        """
        
        # Current context
        context_dt = dct
        
        # Timexes can't span sentence boundaries, but rules can alter the
        # text context for later sentences, so consider each sentence in
        # turn, updating the context if need be.
        for sent in sents:
            
            # Now collect all timexes in this sentence
            timexes = set()
            for (w, pos, ts) in sent:
                for t in ts:
                    timexes.add(t)
            
            # Now annotate each timex
            for timex in timexes:
                
                # First find the token extent of this timex
                tfound = False
                i = 0
                for (w, pos, ts) in sent:
                    if timex in ts:
                        if not tfound:
                            tfound = True
                            ei = i
                        ej = i + 1
                    i += 1
                
                # Slice up into different extents
                before = sent[:ei]
                body = sent[ei:ej]
                after = sent[ej:]
                
                # Now run the rules
                rules_run = set()
                rules_to_run = set(self._rules)
                
                # Apply rules until all rules have been applied
                while rules_to_run:
                    for rule in rules_to_run.copy():
                        
                        # Check that if 'after' is defined, the rules we
                        # must run after have already run
                        after_ok = True
                        for aid in rule.after:
                            if aid not in rules_run:
                                after_ok = False
                        
                        # Apply this rule, and update our states of rules
                        # waiting to run and rules that have been run
                        if after_ok:
                            (success, context_dt) = rule.apply(timex, context_dt, dct, body, before, after)
                            rules_run.add(rule.id)
                            rules_to_run.remove(rule)
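

# What follows is a minimal sketch of a 'complex' normalisation rule that
# satisfies the contract described in the class docstring: an 'id' string
# used for ordering, an 'after' list of rule IDs, and an 'apply' method
# matching the call made in NormalisationRuleEngine.annotate above. The
# class and the fallback value it assigns are illustrative assumptions,
# not part of TERNIP.
class _ExampleComplexRule(object):
    
    id = 'example_complex_rule'
    after = []  # no ordering constraints: this rule may run at any point
    
    def apply(self, timex, cur_context, dct, body, before, after):
        # body, before and after are lists of (token, pos, timexes)
        # tuples; a rule typically inspects them and mutates the timex in
        # place, then reports success and the (possibly updated) context
        if timex.value is None:
            timex.value = 'PAST_REF'  # hypothetical fallback annotation
        return (True, cur_context)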
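

# A hedged usage sketch of the engine. It assumes the base RuleEngine
# provides a load_rules(path) method for loading a directory of rule files
# (check the base class for the exact loader API); the rule path, sentence
# and document creation time below are placeholder examples.
if __name__ == '__main__':
    engine = NormalisationRuleEngine()
    engine.load_rules('rules/normalisation/')  # hypothetical rule directory
    
    # Input is a list of sentences, each a list of (token, POS-tag,
    # set-of-timexes) tuples whose timexes were produced by a recognition
    # engine, e.g. with a timex object spanning the last two tokens:
    #
    #   sents = [[('He', 'PRP', set()),
    #             ('arrived', 'VBD', set()),
    #             ('last', 'JJ', set([timex])),
    #             ('week', 'NN', set([timex]))]]
    #
    # dct is the document creation time in the TIDES-modified ISO8601
    # format, e.g.:
    #
    #   engine.annotate(sents, '20100115')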