# Source code for ternip.rule_engine.normalisation_functions.words_to_num

#!/usr/bin/env python

from operator import itemgetter

import re


# Lookup table from lower-case English number words to their integer values,
# assembled in three stages in ascending order of value.

# Zero to nineteen: each word's value is its position in the tuple.
_word_to_num = {
    word: value
    for value, word in enumerate((
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ))
}

# Multiples of ten, from twenty up to ninety.
_word_to_num.update(
    (word, 10 * tens)
    for tens, word in enumerate((
        "twenty", "thirty", "forty", "fifty",
        "sixty", "seventy", "eighty", "ninety",
    ), start=2)
)

# Powers of ten, from a hundred up to a trillion.
_word_to_num.update(
    hundred=10 ** 2,
    thousand=10 ** 3,
    million=10 ** 6,
    billion=10 ** 9,
    trillion=10 ** 12,
)


def words_to_num(words):
    """
    Converted from GUTime. Given a string of number words, attempts to derive
    the numerical value of those words. Returns an integer.

    The input may be plain number words ("two hundred and five"), digits
    ("42"), or token markup of the form ``<text~...>``, optionally wrapped in
    ``NUM_START``/``NUM_END`` delimiters. A leading "a" or "the" counts as
    "one". An ordinal word ("first") is only accepted as the last word.

    Returns 0 when ``words`` is None, when nothing is left after clean-up, or
    when any word is neither a known number word nor an integer literal.
    """

    # Get out quickly if we're passed in a group that doesn't match
    if words is None:
        return 0

    # If this comes from delimited numbers
    words = re.sub(r'NUM_START', r'', words).strip()
    words = re.sub(r'NUM_END', r'', words).strip()

    # Clean up our input
    words = words.lower()

    # Get rid of tokens, keeping only the text before the '~'
    words = re.sub(r'<([^~]*)[^>]*>', r'\1 ', words).strip()

    # Hyphenated number words: a hyphen joining two words separates them
    # ("twenty-one" -> "twenty one"), otherwise the glued-together result
    # would never be recognised. Any other hyphen is simply dropped.
    words = re.sub(r'(?<=[a-z])-(?=[a-z])', ' ', words)
    words = re.sub(r'-', '', words)

    # Number word separators
    words = re.sub(r',', '', words)
    words = re.sub(r'\sand', '', words)

    # "a" and "the" mean one, really
    words = re.sub(r'^a', 'one', words)
    words = re.sub(r'^the', 'one', words)

    # convert to list
    tokens = words.split()

    # Nothing left to interpret (empty input, or only delimiters and
    # separators): treat it like any other unparseable input
    if not tokens:
        return 0

    # now attempt to convert each word to its numerical equivalent
    last = len(tokens) - 1
    nums = []
    for i, word in enumerate(tokens):
        if word in _word_to_num:
            nums.append(_word_to_num[word])
        elif i == last and word in _ordinal_to_num:
            # only allow ordinal words in the last position
            nums.append(ordinal_to_num(word))
        else:
            # Hope it's a number. If not, we give up and report 0
            try:
                nums.append(int(word))
            except ValueError:
                return 0

    # Now recursively break down these
    return _words_to_num(nums)


def _words_to_num(nums):
    """
    Recursively break down a list of numbers into individual number
    components, compute the value of those components (basically, the bit
    before the largest number, and the bit after) and then put it all back
    together.

    ``nums`` is a list of integers, one per number word, in reading order.
    The largest entry is multiplied by the value of everything before it
    (1 if nothing precedes it) and the value of everything after it is added
    (0 if nothing follows). Returns an integer; an empty list gives 0.
    """

    # Nothing to combine; max() below would raise on an empty sequence
    if not nums:
        return 0

    # base case
    if len(nums) == 1:
        return nums[0]

    # find highest number in the list; on a tie max() keeps the first one
    highest_num_i, highest_num = max(enumerate(nums), key=itemgetter(1))
    before = nums[:highest_num_i]
    after = nums[highest_num_i + 1:]

    # If there are no numbers before the biggest term, then assume it means
    # 1 of those units
    multiplier = _words_to_num(before) if before else 1

    # if there are no numbers after, then there is nothing to add
    remainder = _words_to_num(after) if after else 0

    return (multiplier * highest_num) + remainder

# Mapping of ordinal words ("first" up to "thirty-first") to numbers. Each
# word's value is its one-based position in the tuple below.
_ordinal_to_num = {
    name: position
    for position, name in enumerate((
        "first", "second", "third", "fourth", "fifth",
        "sixth", "seventh", "eighth", "ninth", "tenth",
        "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth",
        "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth",
        "twenty-first", "twenty-second", "twenty-third", "twenty-fourth",
        "twenty-fifth", "twenty-sixth", "twenty-seventh", "twenty-eighth",
        "twenty-ninth", "thirtieth", "thirty-first",
    ), start=1)
}


def ordinal_to_num(o):
    """
    Given an ordinal (i.e., thirty-first or second) in the range 1st-31st (both
    numbers and words accepted), return the number value of that ordinal.
    Unrecognised data gets 1. Returns an integer.

    If ``o`` contains any digits, the first run of digits is the result
    ("21st" gives 21). Otherwise ``o`` is looked up, ignoring case, in the
    table of ordinal words. ``None`` counts as unrecognised data.
    """
    # re.search() would raise TypeError on None; treat it as unrecognised
    if o is None:
        return 1

    # Numeric ordinals: take the digits and ignore any suffix
    match = re.search(r'\d+', o)
    if match is not None:
        return int(match.group())

    # Ordinal words, falling back to 1 for anything not in the table
    return _ordinal_to_num.get(o.lower(), 1)