Source code for ternip.rule_engine.recognition_rule

#!/usr/bin/env python

import re
from ternip.rule_engine.rule import Rule
from ternip.timex import Timex


[docs]class RecognitionRule(Rule):
    """ A class that represents identification rules """

    # If debug mode is enabled, then the comment in the TIMEX tag is set to
    # the ID of the rule which created it
    _DEBUG = False

    def __init__(self, match, type, id, guards=None, after_guards=None, before_guards=None, after=None, squelch=False,
                 case_sensitive=False, deliminate_numbers=False):
        """
        Create a recognition rule, with a number of optional arguments. All
        regex's are in the form to be used with nltk.TokenSearcher.findall
        (http://nltk.googlecode.com/svn/trunk/doc/api/nltk.text.TokenSearcher-class.html#findall)
        however with the amendment that the body of the tokens are actually in
        the form <token~POS>, e.g., <about~.+> would match about with any POS
        tag.
        
        match is a regex. The text that is matched by this regex is annotated as
            a timex. Compulsory.
        type can be date, time or duration (TIMEX3 annotation guidelines). This
            is a compulsory value.
        id is a unique value other rules can refer to in order to express an
            ordering.
        guards is a list of regexes which must be satisfied for this rule to be
            applied. Defauts to an empty list. If the first character in the
            regex is a !, then it means that it's a negative guard - the guard
            must NOT match for this rule to be applied.
        after_guards is a list of regexes, like normal guards, but is only
            matched against the string immediately proceeding a match to check
            if that is satisfied
        before_guards is like after_guards, but matches against the string
            immediately preceeding a match
        after is a list of IDs which must have preceeded the execution of this
            rule
        squelch is a Boolean. If true, then if the 'match' regex matches some
            stuff that's already been timex annotated, those timexes are removed
            and no timex is added to the match. Defaults to false.
        case_sensitive is a Boolean indicating whether or not this rule should
            be matched case sensitively or not.
        deliminate_numbers is a Boolean indicating whether or not this rule
            requires the sentence to have deliminated numbers
        """
        if not after: after = []
        if not before_guards: before_guards = []
        if not after_guards: after_guards = []
        if not guards: guards = []

        self.id = id
        self._type = type
        if case_sensitive:
            self._match = re.compile(self._prep_re(match))
        else:
            self._match = re.compile(self._prep_re(match), re.IGNORECASE)
        self._squelch = squelch
        self.after = after
        self._deliminate_numbers = deliminate_numbers

        # Load guards
        self._guards = self._load_guards(guards)
        self._before_guards = self._load_guards(before_guards)
        self._after_guards = self._load_guards(after_guards)

[docs]    def apply(self, sent):
        """
        Applies this rule to the tokenised sentence. The 'after' ordering
        must be checked by the caller to ensure correct rule application.
        
        sent is a list of tuples (token, POS, [timexes])
        
        A tuple is returned where the first element is a list in the same form
        as sent, with additional timexes added to the 3rd element if need be,
        and the second element in the tuple is whether or not this rule matched
        anything
        """

        senttext = self._toks_to_str(sent)

        if self._deliminate_numbers:
            senttext = self._do_deliminate_numbers(senttext)

        success = False

        # Ensure the sentence-level guards are satisfied
        if not self._check_guards(senttext, self._guards):
            return sent, success

        # Now see if this rule actually matches anything
        for match in self._match.finditer(senttext):
            # Now check before guards
            if not self._check_guards(senttext[:match.start()], self._before_guards):
                continue

            # and after guards
            if not self._check_guards(senttext[match.end():], self._after_guards):
                continue

            # okay, first we need to find which tokens we matched, can do this
            # by using our token markers
            ti = senttext.count('<', 0, match.start())
            tj = senttext.count('<', 0, match.end())

            if not self._squelch:
                t = Timex(self._type) # only create a new timex if not squelching
                if self._DEBUG:
                    t.comment = self.id
            else:
                t = None

            # Add TIMEX
            self._set_timex_extents(t, sent, ti, tj, self._squelch)
            success = True

        return sent, success