#!/usr/bin/env python
import re
from ternip.rule_engine.rule import Rule
from ternip.timex import Timex
class RecognitionRule(Rule):
    """ A class that represents identification rules """

    # If debug mode is enabled, then the comment in the TIMEX tag is set to
    # the ID of the rule which created it
    _DEBUG = False

    def __init__(self, match, type, id, guards=None, after_guards=None, before_guards=None, after=None, squelch=False,
                 case_sensitive=False, deliminate_numbers=False):
        """
        Create a recognition rule, with a number of optional arguments. All
        regexes are in the form to be used with nltk.TokenSearcher.findall
        (http://nltk.googlecode.com/svn/trunk/doc/api/nltk.text.TokenSearcher-class.html#findall)
        however with the amendment that the body of the tokens are actually in
        the form <token~POS>, e.g., <about~.+> would match about with any POS
        tag.

        match is a regex. The text that is matched by this regex is annotated
            as a timex. Compulsory.
        type can be date, time or duration (TIMEX3 annotation guidelines).
            This is a compulsory value.
        id is a unique value other rules can refer to in order to express an
            ordering.
        guards is a list of regexes which must be satisfied for this rule to
            be applied. Defaults to an empty list. If the first character in
            the regex is a !, then it means that it's a negative guard - the
            guard must NOT match for this rule to be applied.
        after_guards is a list of regexes, like normal guards, but is only
            matched against the string immediately following a match to check
            if that is satisfied.
        before_guards is like after_guards, but matches against the string
            immediately preceding a match.
        after is a list of IDs which must have preceded the execution of this
            rule.
        squelch is a Boolean. If true, then if the 'match' regex matches some
            stuff that's already been timex annotated, those timexes are
            removed and no timex is added to the match. Defaults to false.
        case_sensitive is a Boolean indicating whether or not this rule should
            be matched case sensitively or not.
        deliminate_numbers is a Boolean indicating whether or not this rule
            requires the sentence to have delimited numbers.
        """
        # Any falsy value (None, empty list, ...) is replaced with a fresh
        # list so that no mutable default is ever shared between rules.
        after = after or []
        before_guards = before_guards or []
        after_guards = after_guards or []
        guards = guards or []

        self.id = id
        self._type = type

        # Matching is case-insensitive unless explicitly requested otherwise.
        # _prep_re is not defined in this class (presumably inherited from
        # Rule); it converts the <token~POS> rule syntax into a plain regex.
        flags = 0 if case_sensitive else re.IGNORECASE
        self._match = re.compile(self._prep_re(match), flags)

        self._squelch = squelch
        self.after = after
        self._deliminate_numbers = deliminate_numbers

        # Load guards (_load_guards is not defined in this class either)
        self._guards = self._load_guards(guards)
        self._before_guards = self._load_guards(before_guards)
        self._after_guards = self._load_guards(after_guards)

    def apply(self, sent):
        """
        Applies this rule to the tokenised sentence. The 'after' ordering
        must be checked by the caller to ensure correct rule application.

        sent is a list of tuples (token, POS, [timexes])

        A tuple is returned where the first element is a list in the same form
        as sent, with additional timexes added to the 3rd element if need be,
        and the second element in the tuple is whether or not this rule matched
        anything
        """
        senttext = self._toks_to_str(sent)

        if self._deliminate_numbers:
            senttext = self._do_deliminate_numbers(senttext)

        success = False

        # Ensure the sentence-level guards are satisfied
        if not self._check_guards(senttext, self._guards):
            return sent, success

        # Now see if this rule actually matches anything
        for match in self._match.finditer(senttext):
            # The before guards only see the text preceding the match...
            if not self._check_guards(senttext[:match.start()], self._before_guards):
                continue

            # ...and the after guards only see the text following it
            if not self._check_guards(senttext[match.end():], self._after_guards):
                continue

            # Work out which tokens were matched using the token markers: the
            # number of '<' characters before an offset is the index of the
            # token at that offset. This relies on the string form containing
            # exactly one '<' per token (the <token~POS> form) - the helper
            # that builds it is not visible here, so verify if that changes.
            ti = senttext.count('<', 0, match.start())
            tj = senttext.count('<', 0, match.end())

            if not self._squelch:
                # only create a new timex if not squelching
                t = Timex(self._type)
                if self._DEBUG:
                    t.comment = self.id
            else:
                t = None

            # Add the TIMEX to (or, when squelching, strip timexes from) the
            # matched token range
            self._set_timex_extents(t, sent, ti, tj, self._squelch)
            success = True

        return sent, success