Source code for ternip.rule_engine.rule

import re
from ternip.rule_engine import expressions

[docs]class Rule(object): """ Base class for recognition and normalisation rules """ def _prep_re(self, exp, tokenise=True): """ Prepare a regular expression which uses <> for token boundaries. Also, substitute special constants for expressions which can be used in Match and Guard regular expressions: $ORDINAL_WORDS - which consist of word forms of ordinal values, $ORDINAL_NUMS - the number forms (including suffixes) of ordinal values, $DAYS - day names $MONTHS - month names $MONTH_ABBRS - three-letter abbreviations of month names $RELATIVE_DAYS - relative expressions referring to days $DAY_HOLIDAYS $NTH_DOW_HOLIDAYS $FIXED_HOLIDAYS - holidays which have a fixed date $LUNAR_HOLIDAYS - holidays which are relative to Easter """ exp = re.sub(r'\$ORDINAL_WORDS', expressions.ORDINAL_WORDS, exp) exp = re.sub(r'\$ORDINAL_NUMS', expressions.ORDINAL_NUMS, exp) exp = re.sub(r'\$DAYS', expressions.DAYS, exp) exp = re.sub(r'\$MONTHS', expressions.MONTHS, exp) exp = re.sub(r'\$MONTH_ABBRS', expressions.MONTH_ABBRS, exp) exp = re.sub(r'\$RELATIVE_DAYS', expressions.RELATIVE_DAYS, exp) exp = re.sub(r'\$DAY_HOLIDAYS', expressions.DAY_HOLIDAYS, exp) exp = re.sub(r'\$NTH_DOW_HOLIDAYS', expressions.NTH_DOW_HOLIDAYS, exp) exp = re.sub(r'\$FIXED_HOLIDAYS', expressions.FIXED_HOLIDAYS, exp) exp = re.sub(r'\$LUNAR_HOLIDAYS', expressions.LUNAR_HOLIDAYS, exp) exp = re.sub(r'\$UNITS', expressions.UNITS, exp) if tokenise is True: # This code is modified from NLTK's text.py for dealing with pattern # matching with tokenised strings, under the Apache License 2.0 # Natural Language Toolkit (NLTK) http://www.nltk.org/ # Copyright (C) 2001-2010 NLTK Project # Bird, Steven, Edward Loper and Ewan Klein (2009). # Natural Language Processing with Python. O'Reilly Media Inc. exp = re.sub(r'\s', '', exp) exp = re.sub(r'<', '(?:<(?:', exp) exp = re.sub(r'>', ')>)', exp) exp = re.sub(r'(?<!\\)\.', '[^>]', exp) # End NLTK contribution # Fix for NUM_START/NUM_ORD_START which really wants to match on ., but # in a non-greedy way exp = re.sub(r'_START\[\^>\]', '_START(?:.(?!NUM_START))', exp) return exp def _toks_to_str(self, toks): """ Takes a list of (token, pos_tag, timexes) and converts it into the <token~pos> format for matching """ # This code is modified from NLTK's text.py for dealing with pattern # matching with tokenised strings, under the Apache License 2.0 # Natural Language Toolkit (NLTK) http://www.nltk.org/ # Copyright (C) 2001-2010 NLTK Project # Bird, Steven, Edward Loper and Ewan Klein (2009). # Natural Language Processing with Python. O'Reilly Media Inc. return ''.join('<' + w + '~' + pos + '>' for (w, pos, ts) in toks) # End NLTK contribution def _load_guards(self, guards, tokenise=True): """ Given a list of regexs, return a tuple of REs representing positive and negative guards. """ pos = [] neg = [] for guard in guards: if guard[0] == '!': neg.append(re.compile(self._prep_re(guard[1:], tokenise), re.IGNORECASE)) else: pos.append(re.compile(self._prep_re(guard, tokenise), re.IGNORECASE)) return (pos, neg) def _check_guards(self, to_check, (pos, neg)): """ Given some text to check, and a tuple of positive and negative rules, check whether that text satisfies those guards """ # first check positive rules for guard in pos: if not guard.search(to_check): return False # then negative rules for guard in neg: if guard.search(to_check): return False return True def _do_deliminate_numbers(self, sent): """ Translation of GUTime function 'deliminateNumbers' - marks up number sequences """ rest = sent sent = '' previous_word = '' current_word = '' in_number = False while re.search(r'<[a-zA-Z-]+~.+?>', rest): m = re.search(r'<(?P<word>[a-zA-Z-]+)~(?P<pos>.+?)>', rest) sent += m.string[:m.start()] rest = m.string[m.end():] current_word = m.group('word') # Get next word n = re.search(r'<(?P<word>[a-zA-Z-]+)~(?P<pos>.+?)>', rest) if n is not None: next_word = n.group('word') else: next_word = '' # the following deals reasonably well with hypenated numbers like "twenty-one" if re.match(expressions.NUMBER_TERM + '(-' + expressions.NUMBER_TERM + ')*', current_word, re.I) is not None: # This current word is identified as a number if not in_number: # first in (possible) series of numbers to_add = 'NUM_START<' + m.group('word') + '~' + m.group('pos') + '>' in_number = True else: # either not first in series, or between ordinal and regular nums (i.e. "first two") if (re.search(expressions.ORD_UNIT_NUMS + r'$', previous_word) is not None) or (re.search(expressions.ORD_OTHER_NUMS + r'$', previous_word) is not None): # between ordinal and regular sent = re.sub(r'(NUM_START((.(?!NUM_START))*))$', r'NUM_ORD_START\2', sent) # replace with NUM_ORD_START sent += 'NUM_ORD_END' to_add = 'NUM_START<' + m.group('word') + '~' + m.group('pos') + '>' else: # number is continuing to_add = '<' + m.group('word') + '~' + m.group('pos') + '>' else: # current word is not a number if in_number: # previous word was a number # following works fairly well...it avoids marking things like "six and two" as a single # number while still marking things like "two hundred and one" as a single number if (current_word.lower() == 'and') and\ (re.search(expressions.HIGHER_NUMS, previous_word, re.I) is not None) and\ ((re.search(expressions.UNIT_NUMS, next_word, re.I) is not None) or (re.search(expressions.UNIQUE_NUMS, next_word, re.I) is not None) or (re.search( expressions.TENS_NUMS + '(-' + expressions.UNIT_NUMS + '|' + expressions.ORD_UNIT_NUMS + ')?', next_word, re.I) is not None) or (re.search(expressions.ORD_UNIT_NUMS, next_word, re.I) is not None) or (re.search(expressions.ORD_OTHER_NUMS, next_word, re.I) is not None)): to_add = '<' + m.group('word') + '~' + m.group('pos') + '>' else: # number doesn't continue in_number = False if (re.search(expressions.ORD_UNIT_NUMS + r'$', previous_word) is not None) or ( re.search(expressions.ORD_OTHER_NUMS + r'$', previous_word) is not None): sent = re.sub(r'(NUM_START((.(?!NUM_START))*))$', r'NUM_ORD_START\2', sent) # replace with NUM_ORD_START sent += 'NUM_ORD_END' else: sent += 'NUM_END' to_add = '<' + m.group('word') + '~' + m.group('pos') + '>' else: to_add = '<' + m.group('word') + '~' + m.group('pos') + '>' sent += to_add previous_word = current_word if re.match(expressions.NUMBER_TERM + '(-' + expressions.NUMBER_TERM + ')*', current_word, re.I) is not None: # final word is a number sent += 'NUM_END' sent += rest return sent def _set_timex_extents(self, t, sent, ti, tj, squelch): """ Inserts the timex t in the appropriate points in the sentence sent (i.e., between the extents ti, tj). If squelch is set, remove timexes between those extents. """ for i in range(len(sent)): # now get all tokens in the range and add the new timex if needed if i >= ti and i < tj: if squelch: # in the case of this being a squelch rule, remove the # timexes sent[i] = (sent[i][0], sent[i][1], set()) else: # otherwise add the new timex to the list of timexes # associated with this token sent[i][2].add(t)