Source code for ternip.formats.timeml

#!/usr/bin/env python

import xml.dom.minidom
from timex3 import Timex3XmlDocument


class TimeMlDocument(Timex3XmlDocument):
    """
    A class which holds a TimeML representation of a document.

    Suitable for use with the AQUAINT dataset.
    """

    @staticmethod
    def create(sents, tok_offsets=None, add_S=False, add_LEX=False, pos_attr=False):
        """
        Create a TimeML document from the internal representation.

        sents is in the [[(word, pos, timexes), ...], ...] format.

        tok_offsets is used to correctly reinsert whitespace lost in
        tokenisation. It is a list of lists of integers, where each integer
        is the offset of that token from the start of its sentence. If set to
        None (the default), a single space is assumed between all tokens.

        If add_S is set to something other than False, tags marking sentence
        boundaries are added, with the tag name being the value of add_S.

        add_LEX is similar, but for token boundaries.

        pos_attr is similar, but gives the name of the attribute on the LEX
        (or whatever) tag that holds the POS tag.

        Returns the newly constructed TimeMlDocument.
        """

        # Create a blank XML document whose root element is <TimeML>. The DTD
        # URL is passed in the namespace-URI position of createDocument, and
        # no DOCTYPE node is attached (the third argument is None).
        impl = xml.dom.minidom.getDOMImplementation()
        doc = impl.createDocument(
            'http://www.timeml.org/site/publications/timeMLdocs/timeml_1.2.1.dtd',
            'TimeML',
            None)

        # Add the text of the sentences under the root element.
        # NOTE(review): _add_words_to_node_from_sents is not defined in this
        # class; presumably inherited from Timex3XmlDocument -- confirm.
        TimeMlDocument._add_words_to_node_from_sents(doc, doc.documentElement, sents, tok_offsets)

        # Wrap the DOM in a TimeMlDocument object
        x = TimeMlDocument(doc)

        # Now reconcile the S, LEX and TIMEX tags
        x.reconcile(sents, add_S, add_LEX, pos_attr)

        return x