Source code for ternip.formats.xml_doc

import xml.dom.minidom
import logging

import nltk.tag
import nltk.tokenize

from ternip.timex import add_timex_ids

LOGGER = logging.getLogger(__name__)

class XmlDocument(object):
    """
    An abstract base class which all XML types can inherit from. This
    implements almost everything, apart from the conversion of timex
    objects to and from timex tags in the XML. This is done by child
    classes.
    """

    @staticmethod
    def _add_words_to_node_from_sents(doc, node, sents, tok_offsets=None):
        """
        Uses the given node and adds an XML form of sents to it. The node
        passed in should have no children (be an empty element).
        """
        # Just add text here, then leave it up to reconcile to add all
        # other tags
        s_offset = 0
        for i in range(len(sents)):
            for j in range(len(sents[i])):
                (tok, pos, ts) = sents[i][j]

                # Do we know what the token offsets are, in order to
                # reinstate them?
                if tok_offsets is not None:
                    # Add whitespace between tokens if needed
                    while s_offset < tok_offsets[i][j]:
                        node.appendChild(doc.createTextNode(' '))
                        s_offset += 1

                # Add the text
                node.appendChild(doc.createTextNode(tok))

                # If we're not using token offsets, assume a single space
                # is what's used, except if this is the last element.
                if tok_offsets is None:
                    if not (i == len(sents) - 1 and j == len(sents[i]) - 1):
                        node.appendChild(doc.createTextNode(' '))
                else:
                    # Increase our current sentence offset
                    s_offset += len(tok)

        node.normalize()
        return node

    @staticmethod
    def create(sents, tok_offsets=None, add_S=False, add_LEX=False,
               pos_attr=False):
        """
        This is an abstract function for building XML documents from the
        internal representation only. You are not guaranteed to get out of
        get_sents what you put in here: sentences and words will be
        retokenised and retagged unless you explicitly add S and LEX tags
        and the POS attribute to the document using the optional arguments.

        sents is in the [[(word, pos, timexes), ...], ...] format.

        tok_offsets is used to correctly reinsert whitespace lost in
        tokenisation. It's in the format of a list of lists of integers,
        where each integer is the offset of that token from the start of
        the sentence. If set to None (the default), then a single space is
        assumed between all tokens.

        If add_S is set to something other than False, then tags to
        indicate sentence boundaries are added, with the name of the tag
        being the value of add_S.

        add_LEX is similar, but for token boundaries.

        pos_attr is similar, but refers to the name of the attribute on
        the LEX (or whatever) tag that holds the POS tag.
        """
        raise NotImplementedError
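    # For illustration only (not in the original source): a minimal sketch
    # of the internal representation that create() and get_sents() work
    # with. Each sentence is a list of (word, pos, timexes) triples, where
    # timexes is a set of ternip Timex objects covering that token, e.g.
    #
    #   sents = [[('The', 'DT', set()),
    #             ('flight', 'NN', set()),
    #             ('leaves', 'VBZ', set()),
    #             ('tomorrow', 'NN', {t1})]]   # t1: a ternip Timex object
    #
    # With tok_offsets=[[0, 4, 11, 18]] the original whitespace of
    # "The flight leaves tomorrow" can be reinstated exactly; with
    # tok_offsets=None a single space is assumed between tokens.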
    def __init__(self, file, nodename=None, has_S=False, has_LEX=False,
                 pos_attr=False):
        """
        Passes in an XML document (as one consecutive string) which is
        used as the basis for this object. Alternatively, you can pass in
        an xml.dom.minidom.Document, in which case it's not parsed. This
        is used by the static create function.

        nodename is the name of the "body" of this document to be
        considered. If set to None (the default), then the root node is
        considered to be the document body.

        has_S means that the document uses XML tags to mark sentence
        boundaries. This defaults to False, but if your XML document does,
        you should set it to the name of your sentence boundary tag
        (normally 'S').

        has_LEX is similar to has_S, but for token boundaries. Again, set
        this to your tag for token boundaries (not as common, but
        sometimes it's 'lex').

        pos_attr is the name of the attribute on your LEX (or whatever)
        tags that indicates the POS tag for that token.

        The tagger needs tokenised sentences, and tokenised and POS-tagged
        tokens, in order to be able to tag. If the input does not supply
        this data, the NLTK is used to fill in the blanks. If this input
        is supplied, it is blindly accepted as reasonably sensible. If
        there are tokens which are not annotated (for whatever reason),
        then alignment between XML nodes and the results of the tagging
        may fail and give undesirable results. Similarly, if tokens are
        embedded inside other tokens, this will also fail in a likely
        undesirable way, and such a tagging is likely erroneous.
        """
        if isinstance(file, xml.dom.minidom.Document):
            self._xml_doc = file
        else:
            self._xml_doc = xml.dom.minidom.parseString(file)

        if nodename is None:
            self._xml_body = self._xml_doc.documentElement
        else:
            tags = self._xml_doc.getElementsByTagName(nodename)
            if len(tags) != 1:
                raise BadNodeNameError()
            self._xml_body = tags[0]

        self._has_S = has_S
        self._has_LEX = has_LEX
        self._pos_attr = pos_attr

    def _strip_tags(self, doc, tagname, node):
        """
        Recursively remove a tag from this node.
        """
        # Recursive step - depth-first search
        for child in list(node.childNodes):
            # Get the list of nodes which replace this one (if any)
            rep = self._strip_tags(doc, tagname, child)
            if len(rep) == 1:
                # A single node is taking the place of this one (e.g., if
                # there was no change, or a timex tag that only had some
                # text inside it); replace it, but only if the node's
                # changed
                if rep[0] is not child:
                    node.replaceChild(rep[0], child)
                    node.normalize()
            else:
                # There were multiple child nodes, so we need to insert all
                # of them, in order, in the same location where their
                # parent node was. Unfortunately, replaceChild can't
                # replace a node with multiple nodes.
                before = child.nextSibling
                node.removeChild(child)
                for new_node in rep:
                    node.insertBefore(new_node, before)
                node.normalize()

        # Base step
        if node.nodeType == node.ELEMENT_NODE and node.tagName == tagname:
            return [child for child in node.childNodes]
        else:
            return [node]
    def strip_tag(self, tagname):
        """
        Remove this tag from the document.
        """
        self._strip_tags(self._xml_doc, tagname, self._xml_body)
    def strip_timexes(self):
        """
        Strips all timexes from this document. Useful if we're evaluating
        the software - we can just feed in the gold standard directly and
        then compare the output.
        """
        self._strip_tags(self._xml_doc, self._timex_tag_name, self._xml_body)
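    # For illustration only (not in the original source): for a concrete
    # subclass whose _timex_tag_name is 'TIMEX2', strip_timexes() would
    # turn
    #
    #   <doc>Arrived <TIMEX2 VAL="...">last week</TIMEX2>.</doc>
    #
    # into
    #
    #   <doc>Arrived last week.</doc>
    #
    # i.e., the tags are removed but the text they covered stays in place.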
    def _get_text_recurse(self, element, until=None):
        """
        Given an element, returns only the text nodes in it, concatenated
        together, up until the node specified by until is reached.
        """
        cont = True
        text = ""
        if element == until:
            # Check if we need to stop
            cont = False
        elif element.nodeType == element.TEXT_NODE:
            # If it's a text node, add the data, and no more recursion
            text += element.data
        else:
            # Depth-first search, recursive step
            for child in element.childNodes:
                (cont, t) = self._get_text_recurse(child, until)
                text += t
                if not cont:
                    break
        return (cont, text)

    def _get_text(self, element, until=None):
        """
        Given an element, returns only the text nodes in it, concatenated
        together, up until the node specified by until is reached.
        """
        return self._get_text_recurse(element, until)[1]

    def _can_align_node_sent(self, node, sent):
        """
        Can this sentence be aligned with this node?
        """
        text = self._get_text(node)
        texti = 0

        # Go through each token and check that it can be aligned with
        # somewhere in the text
        for i in range(len(sent)):
            offset = text.find(sent[i][0][0], texti)
            if offset == -1:
                # This token can't be aligned, so say we can't align, but
                # do report how many tokens were successfully aligned
                return (False, i, texti)
            else:
                texti = offset + len(sent[i][0])

        return (True, i, texti)

    def _split_text_for_S(self, node, sents, s_name, align_point):
        """
        Given a text node, splits it up into sentences and inserts these
        sentences at the appropriate point in the parent node.
        """
        # Don't include leading whitespace in the tag
        s_start = node.data.find(sents[0][0][0])
        if s_start > 0:
            node.parentNode.insertBefore(
                self._xml_doc.createTextNode(node.data[:s_start]), node)

        # Create an S tag containing the matched part
        s_tag = self._xml_doc.createElement(s_name)
        s_tag.appendChild(
            self._xml_doc.createTextNode(node.data[s_start:align_point]))

        # Insert this where this match tag is
        node.parentNode.insertBefore(s_tag, node)

        # If there's still some text left, then create a new text node
        # with what was left, insert that where this text node was, and
        # recurse on it to tag any more sentences, if there are any
        if align_point < len(node.data):
            new_child = self._xml_doc.createTextNode(node.data[align_point:])
            node.parentNode.replaceChild(new_child, node)
            if len(sents) > 1:
                (can_align, tok_aligned, text_aligned) = \
                    self._can_align_node_sent(new_child, sents[1])
                if can_align:
                    return self._split_text_for_S(new_child, sents[1:],
                                                  s_name, text_aligned)
                else:
                    return sents[1:]
            else:
                return []
        else:
            node.parentNode.removeChild(node)
            return sents[1:]

    def _handle_adding_S_tag(self, node, sent, sents, s_tag, s_name):
        # If this node contains the entirety of this sentence, and isn't a
        # text node, then recurse on it to break it down
        (can_align, tok_aligned, text_aligned) = \
            self._can_align_node_sent(node, sent)
        if can_align and node.nodeType != node.TEXT_NODE:
            if len(sent) == len(sents[0]):
                # The current sent isn't a partial match, so continue as
                # per usual
                sents = self._add_S_tags(node, sents, s_name)
                if len(sents) > 0:
                    sent = list(sents[0])
                else:
                    return (sent, [], s_tag)
            else:
                # Add, because if this is a partial match but we found a
                # full node, it contains the rest of the sentence. Or it's
                # a tag which spans sentence boundaries. The latter is bad.
                s_tag.appendChild(node)
        elif can_align and node.nodeType == node.TEXT_NODE:
            # If this text node does contain the full sentence so far,
            # then break up that text node and add the text between <s>
            # tags as appropriate
            if len(sent) == len(sents[0]):
                sents = self._split_text_for_S(node, sents, s_name,
                                               text_aligned)
                if len(sents) > 0:
                    sent = list(sents[0])
                else:
                    return (sent, [], s_tag)
            else:
                # If we've matched part of a sentence so far, and this
                # text node finishes it off, then break up the text node
                # and add the first bit of it to this node. Then recurse
                # on the rest of it with the remaining sentences
                s_tag.appendChild(
                    self._xml_doc.createTextNode(node.data[:text_aligned]))
                new_child = self._xml_doc.createTextNode(
                    node.data[text_aligned:])
                node.parentNode.replaceChild(new_child, node)
                if len(sents) > 1:
                    (can_align, tok_aligned, text_aligned) = \
                        self._can_align_node_sent(new_child, sents[1])
                    sent = list(sents[1])
                    sents = sents[1:]
                    if can_align:
                        sents = self._split_text_for_S(new_child, sents,
                                                       s_name, text_aligned)
                        if len(sents) > 0:
                            sent = list(sents[0])
                        else:
                            return (sent, [], s_tag)
                    else:
                        (sent, sents, s_tag) = self._handle_adding_S_tag(
                            new_child, sent, sents, s_tag, s_name)
        else:
            # What we have didn't match the whole sentence, so just add
            # the entire node and then update how little we have left.

            # If this is the first incomplete match we've found (that is,
            # our partial sentence is the same as the full one), then this
            # is a new sentence
            if len(sent) == len(sents[0]):
                s_tag = self._xml_doc.createElement(s_name)
                node.parentNode.insertBefore(s_tag, node)

            if node.nodeType == node.TEXT_NODE:
                s_start = node.data.find(sent[0][0])
                if s_start > 0:
                    s_tag.parentNode.insertBefore(
                        self._xml_doc.createTextNode(node.data[:s_start]),
                        s_tag)
                    new_node = self._xml_doc.createTextNode(
                        node.data[s_start:])
                    node.parentNode.replaceChild(new_node, node)
                    node = new_node

            s_tag.appendChild(node)

            # Update our sentence to a partial match
            sent = sent[tok_aligned:]
            return (sent, sents, s_tag)

        return (sent, sents, s_tag)

    def _add_S_tags(self, node, sents, s_name):
        """
        Given a node and some sentences, add tags called s_name such that
        these tags denote sentence boundaries. Returns any sentences which
        could not be assigned in this node.
        """
        # Base case
        if len(sents) > 0:
            sent = list(sents[0])
        else:
            return []

        s_tag = None

        for child in list(node.childNodes):
            (sent, sents, s_tag) = self._handle_adding_S_tag(
                child, sent, sents, s_tag, s_name)

        return sents

    def _add_LEX_tags(self, node, sent, LEX_name):
        """
        Given a node and a sentence, enclose the tokens in that sentence
        with tags called LEX_name to mark token boundaries.
        """
        if len(sent) > 0:
            # Drill down until we reach a text node, then align tokens so
            # far in that text node.
            if node.nodeType == node.TEXT_NODE:
                tok = sent[0][0]
                text = self._get_text(node)
                start = text.find(tok[0])  # Include any whitespace
                if start == -1:
                    # Could not align in this node, so continue
                    return sent
                elif start > 0:
                    before = self._xml_doc.createTextNode(text[:start])
                    node.parentNode.insertBefore(before, node)

                # Now create the LEX tag
                lex_tag = self._xml_doc.createElement(LEX_name)
                lex_tag.appendChild(self._xml_doc.createTextNode(
                    text[start:start + len(tok)]))
                node.parentNode.insertBefore(lex_tag, node)

                # Replace the text node with what's left of the text
                new_text = self._xml_doc.createTextNode(
                    text[start + len(tok):])
                node.parentNode.replaceChild(new_text, node)

                # Continue adding LEX tags for the rest of this sentence
                sent = sent[1:]
                return self._add_LEX_tags(new_text, sent, LEX_name)
            else:
                for child in list(node.childNodes):
                    sent = self._add_LEX_tags(child, sent, LEX_name)

        return sent

    def _get_token_extent(self, node, sent):
        """
        Returns the number of tokens from sent which can be aligned inside
        this node.
        """
        if node.nodeType == node.TEXT_NODE:
            i = 0
            texti = 0
            text = node.data
            for (tok, pos, ts) in sent:
                offset = text.find(tok[0], texti)
                if offset < 0:
                    return i
                else:
                    i += 1
                    texti = offset + len(tok)
        else:
            i = 0
            for child in node.childNodes:
                extent = self._get_token_extent(child, sent)
                sent = sent[extent:]
                i += extent
        return i

    def _add_timex_child(self, timex, sent, node, start, end):
        i = 0
        timex_tag = None

        for child in list(node.childNodes):
            e = self._get_token_extent(child, sent[i:])

            if (i + e) >= start and i <= start and e > 0:
                if child.nodeType == node.TEXT_NODE:
                    # Get the length of the bit before the TIMEX
                    texti = 0
                    for (tok, pos, ts) in sent[i:start]:
                        offset = child.data.find(tok[0], texti)
                        if offset == -1:
                            raise TokeniseError('INTERNAL ERROR: Could not align timex start')
                        texti = offset + len(tok)

                    # Now skip any whitespace before the first token
                    texti = child.data.find(sent[start][0][0], texti)
                    if texti == -1:
                        # The start of the TIMEX isn't in this text node
                        texti = len(child.data)

                    timex_tag = self._xml_doc.createElement(self._timex_tag_name)
                    self._annotate_node_from_timex(timex, timex_tag)

                    # Found our split point, so now create two nodes
                    before_text = self._xml_doc.createTextNode(child.data[:texti])
                    new_child = self._xml_doc.createTextNode(child.data[texti:])
                    node.insertBefore(before_text, child)
                    node.insertBefore(timex_tag, child)
                    node.replaceChild(new_child, child)
                    child = new_child
                    i += self._get_token_extent(before_text, sent[i:])
                    e = self._get_token_extent(child, sent[i:])

            # This node is completely covered by this TIMEX, so include it
            # inside the TIMEX, unless the timex is non-consuming
            if ((i + e) <= end and i >= start and not timex.non_consuming
                    and (e > 0 or (i > start and (i + e) < end))
                    and (child.nodeType != node.TEXT_NODE or (i + e) < end)):
                if timex_tag is None:
                    timex_tag = self._xml_doc.createElement(self._timex_tag_name)
                    self._annotate_node_from_timex(timex, timex_tag)
                    node.insertBefore(timex_tag, child)
                timex_tag.appendChild(child)

            if (((i + e) > end and i < end and not timex.non_consuming)
                    or ((i + e) == end and i >= start
                        and not timex.non_consuming and e > 0
                        and child.nodeType == node.TEXT_NODE)):
                # This crosses the end boundary, so if our TIMEX consumes
                # text then split the node in half (if it's a text node)
                if child.nodeType == node.TEXT_NODE:
                    texti = 0
                    for (tok, pos, ts) in sent[i:end]:
                        offset = child.data.find(tok[0], texti)
                        if offset == -1:
                            raise TokeniseError('INTERNAL ERROR: Could not align timex end ' + tok + ' ' + child.data)
                        texti = offset + len(tok)

                    # Found our split point, so now create two nodes
                    new_child = self._xml_doc.createTextNode(child.data[texti:])
                    timex_tag.appendChild(
                        self._xml_doc.createTextNode(child.data[:texti]))
                    node.replaceChild(new_child, child)
                else:
                    raise NestingError('Can not tag TIMEX (' + str(timex) + ') without causing invalid XML nesting')

            i += e

    def _add_timex(self, timex, sent, s_node):
        # Find the start:end token indices for this TIMEX
        start = 0
        end = 0
        t_reached = False
        for (tok, pos, ts) in sent:
            if timex not in ts and not t_reached:
                start += 1
                end += 1
            if timex in ts:
                t_reached = True
                end += 1

        start_extent = 0
        for child in list(s_node.childNodes):
            extent = self._get_token_extent(child, sent[start_extent:])
            end_extent = start_extent + extent

            if start_extent <= start and end_extent >= end:
                # This child can completely contain the TIMEX, so recurse
                # on it, unless it's a text node
                if child.nodeType == child.TEXT_NODE:
                    self._add_timex_child(timex, sent, s_node, start, end)
                    break
                else:
                    self._add_timex(timex, sent[start_extent:end_extent], child)
                    break
            elif start_extent < start and end_extent < end - 1 and end_extent >= start:
                # This child contains the start of the TIMEX, but can't
                # completely hold it, which must mean the parent node is
                # the highest node which contains the TIMEX
                self._add_timex_child(timex, sent, s_node, start, end)
                break

            start_extent = end_extent
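    # For illustration only (not in the original source): in _add_timex,
    # the start/end loop computes a half-open token range. For a sentence
    # tagged [('on', ...), ('4', ..., {t}), ('July', ..., {t}), ('.', ...)]
    # it yields start = 1, end = 3, i.e. timex t covers tokens [1, 3).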
    def reconcile(self, sents, add_S=False, add_LEX=False, pos_attr=False):
        """
        Reconciles this document against the new internal representation.

        If add_S is set to anything other than False, tags are added to
        indicate the sentence boundaries, with the tag name being the
        value of add_S. add_LEX is the same, but for marking token
        boundaries, and pos_attr is the name of the attribute which holds
        the POS tag for that token. This is mainly useful for transforming
        the TERN documents into something that GUTime can parse.

        If your document already contains S and LEX tags, and
        add_S/add_LEX is set to add them, the old S/LEX tags will be
        stripped first. If pos_attr is set and the attribute name differs
        from the old POS attribute name on the LEX tag, then the old
        attribute will be removed.

        Sentence/token boundaries will not be altered in the final
        document unless add_S/add_LEX is set. If you have changed the
        token boundaries in the internal representation from the original
        form, but are not then adding them back in, reconciliation may
        give undefined results.

        There are some inputs which would produce invalid XML. For
        example, if this document has elements which span multiple
        sentences, but not whole parts of them, then you will be unable to
        add XML tags and get valid XML, so failure will occur in
        unexpected ways.

        If you are adding LEX tags, and your XML document contains tags
        internal to tokens, then reconciliation will fail, as it expects
        tokens to be in a continuous piece of whitespace.
        """
        # First, add S tags if need be.
        if add_S:
            # First, strip any old ones
            if self._has_S:
                self._strip_tags(self._xml_doc, self._has_S, self._xml_body)

            # Then add the new ones
            leftover = self._add_S_tags(self._xml_body, sents, add_S)
            if len(leftover) > 1:
                raise NestingError('Unable to add all S tags, possibly due to bad tag nesting: ' + str(leftover))

            # Update what we consider to be our S tags
            self._has_S = add_S

        # Now get a list of the S nodes, which are used to reconcile
        # individual tokens
        if self._has_S:
            s_nodes = self._xml_body.getElementsByTagName(self._has_S)
        else:
            # There are no S tags in the text.
            # So, going forward, only consider there being one sentence,
            # which belongs to the root node
            s_nodes = [self._xml_body]
            new_sent = []
            for sent in sents:
                for part in sent:
                    new_sent.append(part)
            sents = [new_sent]

        # Now, add LEX tags if need be
        if add_LEX:
            # First, strip any old ones
            if self._has_LEX:
                self._strip_tags(self._xml_doc, self._has_LEX, self._xml_body)

            # Now add those LEX tokens
            for i in range(len(sents)):
                self._add_LEX_tags(s_nodes[i], sents[i], add_LEX)

            # Update what we consider to be our LEX tags
            self._has_LEX = add_LEX

        # Now, add the POS attribute
        if pos_attr and self._has_LEX:
            # Get each LEX tag and add the attribute
            for i in range(len(sents)):
                lex_tags = s_nodes[i].getElementsByTagName(self._has_LEX)
                for j in range(len(sents[i])):
                    # Strip the existing attribute if need be
                    try:
                        lex_tags[j].removeAttribute(self._pos_attr)
                    except xml.dom.NotFoundErr:
                        pass

                    # Now set the new POS attribute
                    lex_tags[j].setAttribute(pos_attr, sents[i][j][1])

            # Update what we think the POS attribute is
            self._pos_attr = pos_attr

        # Strip old TIMEXes to avoid duplicates
        self.strip_timexes()

        # For XML documents, TIMEXes need unique IDs
        all_ts = set()
        for sent in sents:
            for (tok, pos, ts) in sent:
                for t in ts:
                    all_ts.add(t)
        add_timex_ids(all_ts)

        # Now iterate over each sentence
        for i in range(len(sents)):
            # Get all timexes in this sentence
            timexes = set()
            for (word, pos, ts) in sents[i]:
                for t in ts:
                    timexes.add(t)

            # Now, for each timex, add it to the sentence
            for timex in timexes:
                try:
                    self._add_timex(timex, sents[i], s_nodes[i])
                except NestingError:
                    LOGGER.exception("Error whilst attempting to add TIMEX")
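    # For illustration only (not in the original source): a hedged sketch
    # of the usual round trip through a concrete subclass (the class name
    # below is hypothetical; the real ones live elsewhere in
    # ternip.formats):
    #
    #   doc = SomeConcreteXmlDocument(xml_string)  # hypothetical subclass
    #   doc.strip_timexes()             # drop any gold annotations
    #   sents = doc.get_sents()         # annotate the timexes here...
    #   doc.reconcile(sents)            # ...then write them back as tags
    #   xml_out = str(doc)              # serialised XML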
    def _nodes_to_sents(self, node, done_sents, nondone_sents, senti):
        """
        Given a node (which spans multiple sentences), a list of sentences
        which have nodes assigned, and those which don't currently have
        nodes assigned, assigns this node's content to those sentences.
        """
        # Get the next not-done sent
        (sent, snodes) = nondone_sents[0]

        # Align the start of the node with the text we care about
        text = self._get_text(node)
        text = text[text.find(sent[senti]):]

        if len(text) > len(sent) - senti and node.nodeType != node.TEXT_NODE:
            # This node is longer than what's remaining in our sentence,
            # so try and find a small enough piece.
            for child in node.childNodes:
                (done_sents, nondone_sents, senti) = self._nodes_to_sents(
                    child, done_sents, nondone_sents, senti)
        elif len(text) > len(sent) - senti and node.nodeType == node.TEXT_NODE:
            # It's a text node! Append the relevant part of this text node
            # to this sent
            snodes.append(self._xml_doc.createTextNode(text[:len(sent) - senti]))

            # Mark this sentence as done, yay!
            done_sents.append(nondone_sents[0])
            nondone_sents = nondone_sents[1:]

            # Now recurse on the rest of this text node
            if nondone_sents:
                (done_sents, nondone_sents, senti) = self._nodes_to_sents(
                    self._xml_doc.createTextNode(text[len(sent) - senti:]),
                    done_sents, nondone_sents, 0)
            else:
                senti = 0
        else:
            # This node is shorter than or the same length as what's left
            # in this sentence! So we can just add this node
            snodes.append(node)
            nondone_sents[0] = (sent, snodes)
            senti += len(text)

            # Now, if that sentence is complete, move it from nondone into
            # done
            if senti == len(sent):
                done_sents.append(nondone_sents[0])
                nondone_sents = nondone_sents[1:]
                senti = 0

        return (done_sents, nondone_sents, senti)

    def _timex_node_token_align(self, text, sent, tokeni):
        """
        Given a tokenised sentence and some text, with some starting token
        offset, figure out which token comes after the last token in this
        block of text.
        """
        texti = 0
        for (token, pos, timexes) in sent[tokeni:]:
            text_offset = text[texti:].find(sent[tokeni][0][0])
            if text_offset == -1:
                # Can't align with what's left, so the next token must be
                # a boundary
                break
            else:
                # Move our text pointer along to the end of the current
                # token, and continue
                texti += text_offset + len(token)
                tokeni += 1
        return tokeni
    def get_sents(self):
        """
        Returns a representation of this document in the
        [[(word, pos, timexes), ...], ...] format.

        If there are any TIMEXes in the input document that cross sentence
        boundaries (and the input is not already broken up into sentences
        with the S tag), then those TIMEXes are disregarded.
        """
        # Collect all TIMEXes so we can later find those outside of a
        # sentence
        all_timex_nodes = set()
        all_timexes_by_id = dict()
        all_timexes = []

        # Is this pre-tokenised into sentences?
        if self._has_S:
            # Easy
            sents = [(self._get_text(sent), sent) for sent in self._xml_body.getElementsByTagName(self._has_S)]
        else:
            # Get the text, sentence-tokenise it, and then assign the
            # content nodes of a sentence to that sentence. This is used
            # for identifying LEX tags, if any, and TIMEX tags, if any,
            # later.
            (nodesents, ndsents, i) = self._nodes_to_sents(
                self._xml_body, [],
                [(sent, []) for sent in nltk.tokenize.sent_tokenize(self._get_text(self._xml_body))],
                0)
            if len(ndsents) > 0:
                raise TokeniseError('INTERNAL ERROR: there appear to be sentences not assigned to nodes')

            # Combine contents under a 'virtual' S tag
            sents = []
            for (sent, nodes) in nodesents:
                s_node = self._xml_doc.createElement('s')
                sents.append((sent, s_node))
                for node in nodes:
                    # Mark any TIMEX nodes as found before the deep copy
                    if node.nodeType == node.ELEMENT_NODE or node.nodeType == node.DOCUMENT_NODE:
                        for timex_tag in node.getElementsByTagName(self._timex_tag_name):
                            all_timex_nodes.add(timex_tag)
                    if node.nodeType == node.ELEMENT_NODE:
                        if node.tagName == self._timex_tag_name:
                            all_timex_nodes.add(node)

                    # Clone the node to avoid destroying our original
                    # document, and add it to our virtual S node
                    s_node.appendChild(node.cloneNode(True))

        # Is this pre-tokenised into tokens?
        if self._has_LEX:
            # Go through each node and find the LEX tags in there
            tsents = []
            for (sent, s_node) in sents:
                toks = []
                for node in s_node.childNodes:
                    if node.nodeType == node.ELEMENT_NODE and node.tagName == self._has_LEX:
                        # If this is a LEX tag
                        toks.append((self._get_text(node), node))
                    elif node.nodeType == node.ELEMENT_NODE or node.nodeType == node.DOCUMENT_NODE:
                        # Get any LEX tags which are children of this node
                        # and add them
                        for lex in node.getElementsByTagName(self._has_LEX):
                            toks.append((self._get_text(lex), lex))
                tsents.append((toks, s_node))
        else:
            # We don't need to keep nodes this time, so this is easier
            # than sentence tokenisation
            tsents = [([(tok, None) for tok in nltk.tokenize.word_tokenize(sent)], nodes) for (sent, nodes) in sents]

        # Right, now POS tag.
        # If POS is an attribute on the LEX tag, then just use that
        if self._has_LEX and self._pos_attr:
            psents = [([(tok, tag.getAttribute(self._pos_attr)) for (tok, tag) in sent], nodes) for (sent, nodes) in tsents]
        else:
            # Use the NLTK
            psents = [([t for t in nltk.tag.pos_tag([s for (s, a) in sent])], nodes) for (sent, nodes) in tsents]

        # Now do timexes - first get all timex tags in a sent
        txsents = []
        for (sent, s_node) in psents:
            txsent = [(t, pos, set()) for (t, pos) in sent]

            # Get all timexes in this sentence
            timex_nodes = s_node.getElementsByTagName(self._timex_tag_name)

            # Now, for each timex tag, create a timex object to represent
            # it
            for timex_node in timex_nodes:
                all_timex_nodes.add(timex_node)
                timex = self._timex_from_node(timex_node)

                # Record a reference to it, for resolution of attributes
                # which refer to other references later
                all_timexes_by_id[timex.id] = timex
                all_timexes.append(timex)

                # Now figure out the extent of it
                timex_body = self._get_text(timex_node)
                timex_before = self._get_text(s_node, timex_node)

                # Go through each part of the before text and find the
                # first token in the body of the timex
                tokeni = self._timex_node_token_align(timex_before, txsent, 0)

                # Now we have the start token, find the end token from the
                # body of the timex
                tokenj = self._timex_node_token_align(timex_body, txsent, tokeni)

                # Handle non-consuming TIMEXes
                if tokeni == tokenj:
                    timex.non_consuming = True
                    txsent[tokeni][2].add(timex)
                else:
                    # Okay, now add this timex to the relevant tokens
                    for (tok, pos, timexes) in txsent[tokeni:tokenj]:
                        timexes.add(timex)

            txsents.append(txsent)

        # Now get all TIMEX tags which are not inside <s> tags (and assume
        # they're non-consuming)
        for timex_node in self._xml_body.getElementsByTagName(self._timex_tag_name):
            if timex_node not in all_timex_nodes:
                # Found a TIMEX that has not been seen before
                all_timex_nodes.add(timex_node)
                timex = self._timex_from_node(timex_node)
                all_timexes_by_id[timex.id] = timex
                all_timexes.append(timex)

                # Assume it's non-consuming
                timex.non_consuming = True

                # And just add it at the front
                txsents[0][0][2].add(timex)

        # Now resolve any dangling references
        for timex in all_timexes:
            if timex.begin_timex is not None:
                timex.begin_timex = all_timexes_by_id[timex.begin_timex]
            if timex.end_timex is not None:
                timex.end_timex = all_timexes_by_id[timex.end_timex]
            if timex.context is not None:
                timex.context = all_timexes_by_id[timex.context]

        return txsents
    def __str__(self):
        """
        String representation of this document.
        """
        return self._xml_doc.toxml()
    def get_dct_sents(self):
        """
        Returns the creation time sents for this document.
        """
        return []
    def reconcile_dct(self, dct, add_S=False, add_LEX=False, pos_attr=False):
        """
        Adds a TIMEX to the DCT tag and returns the DCT.
        """
        pass
class TokeniseError(Exception):
    def __init__(self, s):
        self._s = s

    def __str__(self):
        return str(self._s)


class NestingError(Exception):
    def __init__(self, s):
        self._s = s

    def __str__(self):
        return str(self._s)


class BadNodeNameError(Exception):
    def __str__(self):
        return "The specified tag name does not exist exactly once in the document"
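
# For illustration only (not in the original source): a minimal, hedged
# usage sketch. strip_tag() needs none of the timex hooks that concrete
# subclasses supply, so a bare XmlDocument suffices here; the timex-aware
# methods (strip_timexes, get_sents, reconcile) additionally require
# _timex_tag_name, _timex_from_node and _annotate_node_from_timex from a
# concrete subclass elsewhere in ternip.formats.
if __name__ == '__main__':
    doc = XmlDocument('<doc>Hello <b>cruel</b> world</doc>')
    doc.strip_tag('b')  # Inline <b> is removed; its text is preserved
    print(doc)          # <?xml version="1.0" ?><doc>Hello cruel world</doc>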