import xml.dom.minidom
import logging
import nltk.tag
import nltk.tokenize
from ternip.timex import add_timex_ids
# Module-level logger for reporting reconciliation problems (e.g. TIMEXes
# that cannot be nested into the existing XML structure)
LOGGER = logging.getLogger(__name__)
class XmlDocument(object):
    """
    An abstract base class which all XML types can inherit from. This implements
    almost everything, apart from the conversion of timex objects to and from
    timex tags in the XML. This is done by child classes (which must provide
    ``_timex_tag_name``, ``_timex_from_node`` and ``_annotate_node_from_timex``).
    """
@staticmethod
def _add_words_to_node_from_sents(doc, node, sents, tok_offsets=None):
"""
Uses the given node and adds an XML form of sents to it. The node
passed in should have no children (be an empty element)
"""
# Just add text here, then leave it up to reconcile to add all other
# tags
s_offset = 0
for i in range(len(sents)):
for j in range(len(sents[i])):
(tok, pos, ts) = sents[i][j]
# Do we know what token offsets are in order to reinstate them?
if tok_offsets is not None:
# Add whitespace between tokens if needed
while s_offset < tok_offsets[i][j]:
node.appendChild(doc.createTextNode(' '))
s_offset += 1
# Add the text
node.appendChild(doc.createTextNode(tok))
# If we're not using token offsets, assume a single space is
# what's used, except if this is the last element.
if tok_offsets is None:
if not (i == len(sents) - 1 and j == len(sents[i]) - 1):
node.appendChild(doc.createTextNode(' '))
else:
# Increase our current sentence offset
s_offset += len(tok)
node.normalize()
return node
@staticmethod
[docs] def create(sents, tok_offsets=None, add_S=False, add_LEX=False, pos_attr=False):
"""
This is an abstract function for building XML documents from the
internal representation only. You are not guaranteed to get out of
get_sents what you put in here. Sentences and words will be retokenised
and retagged unless you explicitly add S and LEX tags and the POS
attribute to the document using the optional arguments.
sents is the [[(word, pos, timexes), ...], ...] format.
tok_offsets is used to correctly reinsert whitespace lost in
tokenisation. It's in the format of a list of lists of integers, where
each integer is the offset from the start of the sentence of that token.
If set to None (the default), then a single space is assumed between
all tokens.
If add_S is set to something other than false, then the tags to indicate
sentence boundaries are added, with the name of the tag being the value
of add_S
add_LEX is similar, but for token boundaries
pos_attr is similar but refers to the name of the attribute on the LEX
(or whatever) tag that holds the POS tag.
"""
raise NotImplementedError
def __init__(self, file, nodename=None, has_S=False, has_LEX=False, pos_attr=False):
"""
Passes in an XML document (as one consecutive string) which is used
as the basis for this object.
Alternatively, you can pass in an xml.dom.Document class which means
that it's not parsed. This is used by the static create function.
Node name is the name of the "body" of this document to be considered.
If set to None (it's default), then the root node is considered to be
the document body.
has_S means that the document uses XML tags to mark sentence boundaries.
This defaults to False, but if your XML document does, you should set it
to the name of your sentence boundary tag (normally 'S').
has_LEX is similar to has_S, but for token boundaries. Again, set this
to your tag for token boundaries (not as common, but sometimes it's
'lex')
pos_attr is the name of the attribute on your LEX (or whatever) tags
that indicates the POS tag for that token.
The tagger needs tokenised sentences and tokenised and POS tagged tokens
in order to be able to tag. If the input does not supply this data, the
NLTK is used to fill the blanks. If this input is supplied, it is
blindly accepted as reasonably sensible. If there are tokens which are
not annotated (for whatever reason), then alignment between XML nodes
and the results of the tagging may fail and give undesirable results.
Similarly, if tokens are embedded inside other tokens, this will also
error in likely undesirable way, and such a tagging is likely erroneous.
"""
if isinstance(file, xml.dom.minidom.Document):
self._xml_doc = file
else:
self._xml_doc = xml.dom.minidom.parseString(file)
if nodename is None:
self._xml_body = self._xml_doc.documentElement
else:
tags = self._xml_doc.getElementsByTagName(nodename)
if len(tags) != 1:
raise BadNodeNameError()
self._xml_body = tags[0]
self._has_S = has_S
self._has_LEX = has_LEX
self._pos_attr = pos_attr
def _strip_tags(self, doc, tagname, node):
    """
    Recursively remove every tag called *tagname* from *node*, splicing the
    removed tags' children into their place.

    Returns the list of nodes that should stand in for *node* in its
    parent: either [node] itself, or node's children when node is itself a
    matching tag.
    """
    # Recursive step - depth-first search
    for child in list(node.childNodes):
        # Get the list of nodes which replace this one (if any)
        rep = self._strip_tags(doc, tagname, child)
        if len(rep) == 1:
            # If it's a single node that's taking the place of this one (e.g.,
            # if there was no change, or a timex tag that only had some text
            # inside it), but only if the node's changed
            if rep[0] is not child:
                node.replaceChild(rep[0], child)
                node.normalize()
        else:
            # There were multiple child nodes, need to insert all of them
            # where in the same location, in order, where their parent
            # node was. Unfortunately replaceChild can't do replacement
            # of a node with multiple nodes.
            before = child.nextSibling
            node.removeChild(child)
            for new_node in rep:
                # insertBefore(x, None) appends, so this also handles the
                # case where child was the last sibling
                node.insertBefore(new_node, before)
            node.normalize()
    # Base step
    if node.nodeType == node.ELEMENT_NODE and node.tagName == tagname:
        return [child for child in node.childNodes]
    else:
        return [node]
[docs] def strip_tag(self, tagname):
"""
Remove this tag from the document.
"""
self._strip_tags(self._xml_doc, tagname, self._xml_body)
[docs] def strip_timexes(self):
"""
Strips all timexes from this document. Useful if we're evaluating the
software - we can just feed in the gold standard directly and compare
the output then.
"""
self._strip_tags(self._xml_doc, self._timex_tag_name, self._xml_body)
def _get_text_recurse(self, element, until=None):
"""
Given an element, returns only the text only nodes in it concatenated
together, up until the node specified by until is reached.
"""
cont = True
text = ""
if element == until:
# Check if we need to stop
cont = False
elif element.nodeType == element.TEXT_NODE:
# If it's a text node, add the data, and no more recursion
text += element.data
else:
# depth-first search, recursive step
for child in element.childNodes:
(cont, t) = self._get_text_recurse(child, until)
text += t
if not cont:
break
return (cont, text)
def _get_text(self, element, until=None):
"""
Given an element, returns only the text only nodes in it concatenated
together, up until the node specified by until is reached.
"""
return self._get_text_recurse(element, until)[1]
def _can_align_node_sent(self, node, sent):
"""
Can this sentence be aligned with this node?
"""
text = self._get_text(node)
texti = 0
# Go through each token and check it can be aligned with somewhere in
# the text
for i in range(len(sent)):
offset = text.find(sent[i][0][0], texti)
if offset == -1:
# This token can't be aligned, so we say we can't align, but do
# say how many tokens were successfully aligned
return (False, i, texti)
else:
texti = offset + len(sent[i][0])
return (True, i, texti)
def _split_text_for_S(self, node, sents, s_name, align_point):
    """
    Given a text node whose data starts the first sentence in *sents*,
    wrap that sentence (which ends *align_point* characters into the node)
    in a new element called *s_name*, inserted where the text node was.
    Recurses on any trailing text to tag further sentences, and returns
    the sentences that could not be placed in this node.
    """
    # Don't include leading whitespace in the tag
    s_start = node.data.find(sents[0][0][0])
    if s_start > 0:
        node.parentNode.insertBefore(self._xml_doc.createTextNode(node.data[:s_start]), node)
    # Create an S tag containing the matched part
    s_tag = self._xml_doc.createElement(s_name)
    s_tag.appendChild(self._xml_doc.createTextNode(node.data[s_start:align_point]))
    # Insert this where this match tag is
    node.parentNode.insertBefore(s_tag, node)
    # If there's still some text left, then create a new text node with
    # what was left, and then insert that where this text node was, and
    # recurse on it to tag any more sentences, if there are any
    if align_point < len(node.data):
        new_child = self._xml_doc.createTextNode(node.data[align_point:])
        node.parentNode.replaceChild(new_child, node)
        if len(sents) > 1:
            # Check whether the next sentence also starts in the leftover
            # text before recursing on it
            (can_align, tok_aligned, text_aligned) = self._can_align_node_sent(new_child, sents[1])
            if can_align:
                return self._split_text_for_S(new_child, sents[1:], s_name, text_aligned)
            else:
                return sents[1:]
        else:
            return []
    else:
        # The original text node was fully consumed by the sentence
        node.parentNode.removeChild(node)
        return sents[1:]
def _handle_adding_S_tag(self, node, sent, sents, s_tag, s_name):
    """
    Worker for _add_S_tags: try to place (part of) the current sentence
    *sent* against *node*. *sents* is the full remaining sentence list
    (sents[0] is the untrimmed version of *sent*), and *s_tag* is the
    currently-open sentence element, or None if no sentence is open.

    Returns the updated (sent, sents, s_tag) triple.
    """
    # If this node contains the entirety of this sentence, and isn't a
    # text node, then recurse on it to break it down
    (can_align, tok_aligned, text_aligned) = self._can_align_node_sent(node, sent)
    if can_align and node.nodeType != node.TEXT_NODE:
        if len(sent) == len(sents[0]):
            # Current sent isn't a partial match, continue as per usual
            sents = self._add_S_tags(node, sents, s_name)
            if len(sents) > 0:
                sent = list(sents[0])
            else:
                # All sentences placed: nothing left to do
                return (sent, [], s_tag)
        else:
            # Add, because if this is a partial match but found a full
            # node, it contains the rest of the sentence. Or it's a tag
            # which spans sentence boundaries. The latter is bad.
            s_tag.appendChild(node)
    elif can_align and node.nodeType == node.TEXT_NODE:
        # If this text node does contain the full sentence so far, then
        # break up that text node and add the text between <s> tags as
        # appropriate
        if len(sent) == len(sents[0]):
            sents = self._split_text_for_S(node, sents, s_name, text_aligned)
            if len(sents) > 0:
                sent = list(sents[0])
            else:
                return (sent, [], s_tag)
        else:
            # If we've matched part of a sentence so far, and this
            # text node finishes it off, then break up the text node and
            # add the first bit of it to this node. Then recurse on the
            # rest of it with the remaining sentences
            s_tag.appendChild(self._xml_doc.createTextNode(node.data[:text_aligned]))
            new_child = self._xml_doc.createTextNode(node.data[text_aligned:])
            node.parentNode.replaceChild(new_child, node)
            # NOTE(review): sents[1] is accessed here before the
            # len(sents) > 1 check below — if this partial sentence is the
            # last one, this raises IndexError. Confirm whether that path
            # is reachable in practice.
            (can_align, tok_aligned, text_aligned) = self._can_align_node_sent(new_child, sents[1])
            if len(sents) > 1:
                sent = list(sents[1])
                sents = sents[1:]
                if can_align:
                    sents = self._split_text_for_S(new_child, sents, s_name, text_aligned)
                    if len(sents) > 0:
                        sent = list(sents[0])
                    else:
                        return (sent, [], s_tag)
                else:
                    (sent, sents, s_tag) = self._handle_adding_S_tag(new_child, sent, sents, s_tag, s_name)
    else:
        # What we have didn't match the whole sentence, so just add the
        # entire node and then update how little we have left.
        # If this is the first incomplete match we've found (that is,
        # our partial sentence is the same as the full one), then this
        # is a new sentence
        if len(sent) == len(sents[0]):
            s_tag = self._xml_doc.createElement(s_name)
            node.parentNode.insertBefore(s_tag, node)
            if node.nodeType == node.TEXT_NODE:
                # Keep any leading whitespace outside the new S tag
                s_start = node.data.find(sent[0][0])
                if s_start > 0:
                    s_tag.parentNode.insertBefore(self._xml_doc.createTextNode(node.data[:s_start]), s_tag)
                new_node = self._xml_doc.createTextNode(node.data[s_start:])
                node.parentNode.replaceChild(new_node, node)
                node = new_node
        s_tag.appendChild(node)
        # update our sentence to a partial match
        sent = sent[tok_aligned:]
        return (sent, sents, s_tag)
    return (sent, sents, s_tag)
def _add_S_tags(self, node, sents, s_name):
"""
Given a node, and some sentences, add tags called s_name such that these
tags denote sentence boundaries. Return any sentences which could not
be assigned in this node.
"""
# Base case
if len(sents) > 0:
sent = list(sents[0])
else:
return []
s_tag = None
for child in list(node.childNodes):
(sent, sents, s_tag) = self._handle_adding_S_tag(child, sent, sents, s_tag, s_name)
return sents
def _add_LEX_tags(self, node, sent, LEX_name):
    """
    Given a node and a (partially consumed) sentence, enclose the tokens
    of that sentence occurring inside the node with tags called *LEX_name*
    to mark token boundaries. Returns the tokens that could not be placed
    in this node.
    """
    if len(sent) > 0:
        # Drill down until we reach a text node, then align tokens so far in
        # that text node.
        if node.nodeType == node.TEXT_NODE:
            tok = sent[0][0]
            # node is a text node, so this is just its character data
            text = self._get_text(node)
            # Only the token's first character is searched for
            start = text.find(tok[0])
            # Include any whitespace
            if start == -1:
                # Could not align in this node, so continue
                return sent
            elif start > 0:
                # Keep text preceding the token outside the LEX tag
                before = self._xml_doc.createTextNode(text[:start])
                node.parentNode.insertBefore(before, node)
            # Now create the LEX tag
            lex_tag = self._xml_doc.createElement(LEX_name)
            lex_tag.appendChild(self._xml_doc.createTextNode(text[start:start + len(tok)]))
            node.parentNode.insertBefore(lex_tag, node)
            # Replace the text node with the list tail
            new_text = self._xml_doc.createTextNode(text[start + len(tok):])
            node.parentNode.replaceChild(new_text, node)
            # Continue adding for the rest of this LEX node
            sent = sent[1:]
            return self._add_LEX_tags(new_text, sent, LEX_name)
        else:
            # Element node: recurse over a snapshot of the children,
            # consuming tokens as they are placed
            for child in list(node.childNodes):
                sent = self._add_LEX_tags(child, sent, LEX_name)
    return sent
def _get_token_extent(self, node, sent):
    """
    Count how many of the leading tokens of *sent* can be aligned within
    the text content of *node*, recursing through element children.
    """
    if node.nodeType == node.TEXT_NODE:
        i = 0
        texti = 0
        text = node.data
        for (tok, pos, ts) in sent:
            # Only the token's first character is searched for; the cursor
            # then skips the whole token length
            offset = text.find(tok[0], texti)
            if offset < 0:
                # Alignment failed: the tokens matched so far are this
                # node's extent
                return i
            else:
                i += 1
                texti = offset + len(tok)
    else:
        i = 0
        for child in node.childNodes:
            extent = self._get_token_extent(child, sent)
            # Consume the tokens matched by this child before moving on to
            # its next sibling
            sent = sent[extent:]
            i += extent
    return i
def _add_timex_child(self, timex, sent, node, start, end):
    """
    Insert a timex tag for *timex* directly under *node*, covering the
    tokens of *sent* from index *start* (inclusive) to *end* (exclusive).
    Text nodes crossing the timex boundaries are split; element children
    wholly inside the extent are moved into the new tag.

    Raises TokeniseError when tokens cannot be re-aligned with the text,
    and NestingError when tagging would produce invalid XML nesting.
    """
    i = 0
    timex_tag = None
    for child in list(node.childNodes):
        # How many of the remaining tokens this child accounts for
        e = self._get_token_extent(child, sent[i:])
        if (i + e) >= start and i <= start and e > 0:
            # The timex starts somewhere within this child
            if child.nodeType == node.TEXT_NODE:
                # get length of bit before TIMEX
                texti = 0
                for (tok, pos, ts) in sent[i:start]:
                    offset = child.data.find(tok[0], texti)
                    if offset == -1:
                        raise TokeniseError('INTERNAL ERROR: Could not align timex start')
                    texti = offset + len(tok)
                # Now whitespace before first token
                texti = child.data.find(sent[start][0][0], texti)
                if texti == -1:
                    # The start of the TIMEX isn't in this text node
                    texti = len(child.data)
                timex_tag = self._xml_doc.createElement(self._timex_tag_name)
                self._annotate_node_from_timex(timex, timex_tag)
                # Found our split point, so now create two nodes
                before_text = self._xml_doc.createTextNode(child.data[:texti])
                new_child = self._xml_doc.createTextNode(child.data[texti:])
                node.insertBefore(before_text, child)
                node.insertBefore(timex_tag, child)
                node.replaceChild(new_child, child)
                child = new_child
                # Re-count extents against the two halves of the split
                i += self._get_token_extent(before_text, sent[i:])
                e = self._get_token_extent(child, sent[i:])
        # This node is completely covered by this TIMEX, so include it
        # inside the TIMEX, unless the timex is non consuming
        if (i + e) <= end and i >= start and not timex.non_consuming and (
                e > 0 or (i > start and (i + e) < end)) and (child.nodeType != node.TEXT_NODE or (i + e) < end):
            if timex_tag is None:
                # No tag created yet (the timex began exactly at a child
                # boundary), so create and insert it here
                timex_tag = self._xml_doc.createElement(self._timex_tag_name)
                self._annotate_node_from_timex(timex, timex_tag)
                node.insertBefore(timex_tag, child)
            timex_tag.appendChild(child)
        if ((i + e) > end and i < end and not timex.non_consuming) or \
                ((i + e) == end and i >= start and not timex.non_consuming and e > 0 and child.nodeType == node.TEXT_NODE):
            # This crosses the end boundary, so if our TIMEX consumes text
            # then split the node in half (if it's a text node)
            if child.nodeType == node.TEXT_NODE:
                texti = 0
                for (tok, pos, ts) in sent[i:end]:
                    offset = child.data.find(tok[0], texti)
                    if offset == -1:
                        raise TokeniseError('INTERNAL ERROR: Could not align timex end ' + tok + ' ' + child.data)
                    texti = offset + len(tok)
                # Found our split point, so now create two nodes
                new_child = self._xml_doc.createTextNode(child.data[texti:])
                timex_tag.appendChild(self._xml_doc.createTextNode(child.data[:texti]))
                node.replaceChild(new_child, child)
            else:
                raise NestingError('Can not tag TIMEX (' + str(timex) + ') without causing invalid XML nesting')
        i += e
def _add_timex(self, timex, sent, s_node):
    """
    Add a tag for *timex* into *s_node*, the element covering the
    tokenised sentence *sent*, recursing into the deepest child element
    that fully contains the timex's token extent.
    """
    # Find start:end indices for this TIMEX
    start = 0
    end = 0
    t_reached = False
    for (tok, pos, ts) in sent:
        if timex not in ts and not t_reached:
            start += 1
            end += 1
        if timex in ts:
            t_reached = True
            end += 1
    start_extent = 0
    for child in list(s_node.childNodes):
        # Token span covered by this child: [start_extent, end_extent)
        extent = self._get_token_extent(child, sent[start_extent:])
        end_extent = start_extent + extent
        if start_extent <= start and end_extent >= end:
            # This child can completely contain the TIMEX, so recurse on it
            # unless it's a text node
            if child.nodeType == child.TEXT_NODE:
                self._add_timex_child(timex, sent, s_node, start, end)
                break
            else:
                self._add_timex(timex, sent[start_extent:end_extent], child)
                break
        elif start_extent < start and end_extent < end - 1 and end_extent >= start:
            # This child contains the start of the TIMEX, but can't
            # completely hold it, which must mean the parent node is the
            # highest node which contains the TIMEX
            self._add_timex_child(timex, sent, s_node, start, end)
            break
        start_extent = end_extent
def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
    """
    Reconciles this document against the new internal representation. If
    add_S is set to anything other than False, this means tags are indicated
    to indicate the sentence boundaries, with the tag names being the value
    of add_S. add_LEX is the same, but for marking token boundaries, and
    pos_attr is the name of the attribute which holds the POS tag for that
    token. This is mainly useful for transforming the TERN documents into
    something that GUTime can parse.

    If your document already contains S and LEX tags, and add_S/add_LEX is
    set to add them, old S/LEX tags will be stripped first. If pos_attr is
    set and the attribute name differs from the old POS attribute name on
    the lex tag, then the old attribute will be removed.

    Sentence/token boundaries will not be altered in the final document
    unless add_S/add_LEX is set. If you have changed the token boundaries in
    the internal representation from the original form, but are not then
    adding them back in, reconciliation may give undefined results.

    There are some inputs which would output invalid XML. For example, if
    this document has elements which span multiple sentences, but not whole
    parts of them, then you will be unable to add XML tags and get valid
    XML, so failure will occur in unexpected ways.

    If you are adding LEX tags, and your XML document contains tags internal
    to tokens, then reconciliation will fail, as it expects tokens to be in
    a continuous piece of whitespace.
    """
    # First, add S tags if need be.
    if add_S:
        # First, strip any old ones
        if self._has_S:
            self._strip_tags(self._xml_doc, self._has_S, self._xml_body)
        # Then add the new ones
        leftover = self._add_S_tags(self._xml_body, sents, add_S)
        # NOTE(review): only more than one leftover sentence triggers the
        # error — a single unplaced sentence is silently tolerated.
        # Confirm whether `> 1` rather than `> 0` is intentional.
        if len(leftover) > 1:
            raise NestingError('Unable to add all S tags, possibly due to bad tag nesting' + str(leftover))
        # Update what we consider to be our S tags
        self._has_S = add_S
    # Now, get a list of the S nodes, which are used to reconcile individual
    # tokens
    if self._has_S:
        s_nodes = self._xml_body.getElementsByTagName(self._has_S)
    else:
        # There are no S tokens in the text. So, going forward, only
        # consider there being one sentence, which belongs to the root node
        s_nodes = [self._xml_body]
        new_sent = []
        for sent in sents:
            for part in sent:
                new_sent.append(part)
        sents = [new_sent]
    # Now, add LEX tags if need be
    if add_LEX:
        # First, strip any old ones
        if self._has_LEX:
            self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)
        # Now add those LEX tokens
        for i in range(len(sents)):
            self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)
        # Update what we consider to be our LEX tags
        self._has_LEX = add_LEX
    # Now, add the POS attribute
    if pos_attr and self._has_LEX:
        # Get each LEX tag and add the attribute
        for i in range(len(sents)):
            lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
            for j in range(len(sents[i])):
                # Strip the existing attribute if need be
                try:
                    lex_tags[j].removeAttribute(self._pos_attr)
                except xml.dom.NotFoundErr:
                    pass
                # Now set the new POS attr
                lex_tags[j].setAttribute(pos_attr, sents[i][j][1])
        # Update what we think is the pos attr
        self._pos_attr = pos_attr
    # Strip old TIMEXes to avoid duplicates
    self.strip_timexes()
    # For XML documents, TIMEXes need unique IDs
    all_ts = set()
    for sent in sents:
        for (tok, pos, ts) in sent:
            for t in ts:
                all_ts.add(t)
    add_timex_ids(all_ts)
    # Now iterate over each sentence
    for i in range(len(sents)):
        # Get all timexes in this sentence
        timexes = set()
        for (word, pos, ts) in sents[i]:
            for t in ts:
                timexes.add(t)
        # Now, for each timex, add it to the sentence
        for timex in timexes:
            try:
                self._add_timex(timex, sents[i], s_nodes[i])
            except NestingError as e:
                # Best-effort: skip timexes that cannot be nested validly
                LOGGER.exception("Error whilst attempting to add TIMEX")
def _nodes_to_sents(self, node, done_sents, nondone_sents, senti):
    """
    Given a node (which spans multiple sentences), a list of sentences which
    have nodes assigned, and those which don't currently have nodes assigned,
    assign (pieces of) this node to the unassigned sentences.

    Each element of the sentence lists is a (sent, snodes) pair, where sent
    is the raw sentence string (from the sentence tokeniser) and snodes is
    the list of nodes covering it. *senti* is the character offset into the
    current sentence reached so far. Returns the updated
    (done_sents, nondone_sents, senti) triple.
    """
    # Get next not done sent
    (sent, snodes) = nondone_sents[0]
    # Align start of node with where we care about
    text = self._get_text(node)
    # sent[senti] is a single character: skip any text before it
    text = text[text.find(sent[senti]):]
    if len(text) > len(sent) - senti and node.nodeType != node.TEXT_NODE:
        # This node is longer than what's remaining in our sentence, so
        # try and find a small enough piece.
        for child in node.childNodes:
            (done_sents, nondone_sents, senti) = self._nodes_to_sents(child, done_sents, nondone_sents, senti)
    elif len(text) > len(sent) - senti and node.nodeType == node.TEXT_NODE:
        # It's a text node! Append the relevant part of this text node to
        # this sent
        snodes.append(self._xml_doc.createTextNode(text[:len(sent) - senti]))
        # Mark this sentence as done, yay!
        done_sents.append(nondone_sents[0])
        nondone_sents = nondone_sents[1:]
        # Now recurse on the next text node
        if nondone_sents:
            (done_sents, nondone_sents, senti) = self._nodes_to_sents(
                self._xml_doc.createTextNode(text[len(sent) - senti:]), done_sents, nondone_sents, 0)
        else:
            senti = 0
    else:
        # This node is shorter or the same length as what's left in this
        # sentence! So we can just add this node
        snodes.append(node)
        nondone_sents[0] = (sent, snodes)
        senti += len(text)
        # Now, if that sentence is complete, then move it from nondone into
        # done
        if senti == len(sent):
            done_sents.append(nondone_sents[0])
            nondone_sents = nondone_sents[1:]
            senti = 0
    return (done_sents, nondone_sents, senti)
def _timex_node_token_align(self, text, sent, tokeni):
"""
Given a tokenised sentence and some text, with some starting token
offset, figure out which is the token after the last token in this
block of text
"""
texti = 0
for (token, pos, timexes) in sent[tokeni:]:
text_offset = text[texti:].find(sent[tokeni][0][0])
if text_offset == -1:
# can't align with what's left, so next token must be a boundary
break
else:
# Move our text point along to the end of the current token,
# and continue
texti += text_offset + len(token)
tokeni += 1
return tokeni
def get_sents(self):
    """
    Returns a representation of this document in the
    [[(word, pos, timexes), ...], ...] format.

    If there are any TIMEXes in the input document that cross sentence
    boundaries (and the input is not already broken up into sentences with
    the S tag), then those TIMEXes are disregarded.
    """
    # Collect all TIMEXs so we can later find those outside of a sentence
    all_timex_nodes = set()
    all_timexes_by_id = dict()
    all_timexes = []
    # Is this pre-tokenised into sentences?
    if self._has_S:
        # easy: each S element is one sentence
        sents = [(self._get_text(sent), sent) for sent in self._xml_body.getElementsByTagName(self._has_S)]
    else:
        # Get the text, sentence tokenise it and then assign the content
        # nodes of a sentence to that sentence. This is used for identifying
        # LEX tags, if any, and TIMEX tags, if any, later.
        (nodesents, ndsents, i) = self._nodes_to_sents(self._xml_body, [],
            [(sent, []) for sent in nltk.tokenize.sent_tokenize(self._get_text(self._xml_body))], 0)
        if len(ndsents) > 0:
            raise TokeniseError('INTERNAL ERROR: there appears to be sentences not assigned to nodes')
        # Combine contents under a 'virtual' S tag
        sents = []
        for (sent, nodes) in nodesents:
            s_node = self._xml_doc.createElement('s')
            sents.append((sent, s_node))
            for node in nodes:
                # Mark any TIMEX nodes as found before the deep copy
                if node.nodeType == node.ELEMENT_NODE or node.nodeType == node.DOCUMENT_NODE:
                    for timex_tag in node.getElementsByTagName(self._timex_tag_name):
                        all_timex_nodes.add(timex_tag)
                if node.nodeType == node.ELEMENT_NODE:
                    if node.tagName == self._timex_tag_name:
                        all_timex_nodes.add(node)
                # Clone the node to avoid destroying our original document
                # and add it to our virtual S node
                s_node.appendChild(node.cloneNode(True))
    # Is this pre-tokenised into tokens?
    if self._has_LEX:
        # Go through each node, and find the LEX tags in there
        tsents = []
        for (sent, s_node) in sents:
            toks = []
            for node in s_node.childNodes:
                if node.nodeType == node.ELEMENT_NODE and node.tagName == self._has_LEX:
                    # If this is a LEX tag
                    toks.append((self._get_text(node), node))
                elif node.nodeType == node.ELEMENT_NODE or node.nodeType == node.DOCUMENT_NODE:
                    # get any lex tags which are children of this node
                    # and add them
                    for lex in node.getElementsByTagName(self._has_LEX):
                        toks.append((self._get_text(lex), lex))
            tsents.append((toks, s_node))
    else:
        # Don't need to keep nodes this time, so this is easier than
        # sentence tokenisation
        tsents = [([(tok, None) for tok in nltk.tokenize.word_tokenize(sent)], nodes) for (sent, nodes) in sents]
    # Right, now POS tag. If POS is an attribute on the LEX tag, then just
    # use that
    if self._has_LEX and self._pos_attr:
        psents = [([(tok, tag.getAttribute(self._pos_attr)) for (tok, tag) in sent], nodes) for (sent, nodes) in
                  tsents]
    else:
        # use the NLTK
        psents = [([t for t in nltk.tag.pos_tag([s for (s, a) in sent])], nodes) for (sent, nodes) in tsents]
    # Now do timexes - first get all timex tags in a sent
    txsents = []
    for (sent, s_node) in psents:
        txsent = [(t, pos, set()) for (t, pos) in sent]
        # Get all timexes in this sentence
        timex_nodes = s_node.getElementsByTagName(self._timex_tag_name)
        # Now, for each timex tag, create a timex object to
        # represent it
        for timex_node in timex_nodes:
            all_timex_nodes.add(timex_node)
            timex = self._timex_from_node(timex_node)
            # Record a reference to it for resolution of attributes which
            # refer to other references later
            all_timexes_by_id[timex.id] = timex
            all_timexes.append(timex)
            # Now figure out the extent of it
            timex_body = self._get_text(timex_node)
            timex_before = self._get_text(s_node, timex_node)
            # Go through each part of the before text and find the
            # first token in the body of the timex
            tokeni = self._timex_node_token_align(timex_before, txsent, 0)
            # Now we have the start token, find the end token from
            # the body of the timex
            tokenj = self._timex_node_token_align(timex_body, txsent, tokeni)
            # Handle non-consuming TIMEXes
            if tokeni == tokenj:
                timex.non_consuming = True
                txsent[tokeni][2].add(timex)
            else:
                # Okay, now add this timex to the relevant tokens
                for (tok, pos, timexes) in txsent[tokeni:tokenj]:
                    timexes.add(timex)
        txsents.append(txsent)
    # Now get all TIMEX tags which are not inside <s> tags (and assume
    # they're non-consuming)
    for timex_node in self._xml_body.getElementsByTagName(self._timex_tag_name):
        if timex_node not in all_timex_nodes:
            # Found a TIMEX that has not been seen before
            all_timex_nodes.add(timex_node)
            timex = self._timex_from_node(timex_node)
            all_timexes_by_id[timex.id] = timex
            all_timexes.append(timex)
            # Assume it's non-consuming
            timex.non_consuming = True
            # And just add it at the front
            txsents[0][0][2].add(timex)
    # Now resolve any dangling references: begin_timex/end_timex/context
    # are stored as IDs by _timex_from_node and are swapped here for the
    # Timex objects themselves
    for timex in all_timexes:
        if timex.begin_timex != None:
            timex.begin_timex = all_timexes_by_id[timex.begin_timex]
        if timex.end_timex != None:
            timex.end_timex = all_timexes_by_id[timex.end_timex]
        if timex.context != None:
            timex.context = all_timexes_by_id[timex.context]
    return txsents
def __str__(self):
"""
String representation of this document
"""
return self._xml_doc.toxml()
[docs] def get_dct_sents(self):
"""
Returns the creation time sents for this document.
"""
return []
[docs] def reconcile_dct(self, dct, add_S=False, add_LEX=False, pos_attr=False):
"""
Adds a TIMEX to the DCT tag and return the DCT
"""
pass
class TokeniseError(Exception):
    """
    Raised when tokens cannot be aligned with the document text.
    """

    def __init__(self, s):
        # Fix: also hand the message to Exception so that e.args and
        # repr(e) are populated; _s is kept for backward compatibility
        super(TokeniseError, self).__init__(s)
        self._s = s

    def __str__(self):
        return str(self._s)
class NestingError(Exception):
    """
    Raised when adding a tag would produce invalid XML nesting.
    """

    def __init__(self, s):
        # Fix: also hand the message to Exception so that e.args and
        # repr(e) are populated; _s is kept for backward compatibility
        super(NestingError, self).__init__(s)
        self._s = s

    def __str__(self):
        return str(self._s)
class BadNodeNameError(Exception):
    """
    Raised when the requested document body tag name does not occur
    exactly once in the document.
    """

    def __str__(self):
        return "The specified tag name does not exist exactly once in the document"