/ternip/formats/tempeval2.py

https://github.com/cnorthwood/ternip
Python | 178 lines | 93 code | 47 blank | 38 comment | 30 complexity | 18bec73cc263cacbec09d9ec5b835d60 MD5 | raw file
  1. #!/usr/bin/env python
  2. from collections import defaultdict
  3. import copy
  4. import nltk.tag
  5. from ternip.timex import add_timex_ids
  6. class TempEval2Document(object):
  7. """
  8. A class which uses the format of stand-off format of TempEval-2
  9. """
  10. @staticmethod
  11. def create(sents, docid=''):
  12. """
  13. Creates a TempEval-2 document from the internal representation
  14. sents is the [[(word, pos, timexes), ...], ...] format.
  15. """
  16. # Create a blank document
  17. d = TempEval2Document('', docid)
  18. # Add sents
  19. d.reconcile(sents)
  20. return d
  21. @staticmethod
  22. def load_multi(file, dct_file):
  23. """
  24. Load multiple documents from a single base-segmentation.tab
  25. """
  26. ds = defaultdict(list)
  27. dcts = defaultdict(str)
  28. for line in dct_file.splitlines():
  29. parts = line.split('\t')
  30. dcts[parts[0]] = parts[1]
  31. for line in file.splitlines():
  32. parts = line.split('\t')
  33. ds[parts[0]].append(line)
  34. docs = []
  35. for d in ds:
  36. docs.append(TempEval2Document('\n'.join(ds[d]), d, dcts[d]))
  37. return docs
  38. def __init__(self, file, docid='', dct='XXXXXXXX'):
  39. """
  40. Load a document
  41. """
  42. tok_sents = []
  43. self.docid = docid
  44. for line in file.splitlines():
  45. parts = line.split('\t')
  46. if len(parts) > 3:
  47. i = int(parts[1])
  48. j = int(parts[2])
  49. if len(tok_sents) <= i:
  50. tok_sents.insert(i, [])
  51. tok_sents[i].insert(j, parts[3])
  52. self._sents = [[(tok, pos, set()) for (tok, pos) in nltk.tag.pos_tag(tok_sent)] for tok_sent in tok_sents]
  53. self.dct = dct
  54. def get_sents(self):
  55. """
  56. Returns a representation of this document in the
  57. [[(word, pos, timexes), ...], ...] format.
  58. """
  59. return copy.deepcopy(self._sents)
  60. def get_dct_sents(self):
  61. """
  62. Returns the creation time sents for this document.
  63. """
  64. return [[(self.dct, 'DCT', set())]]
  65. def reconcile_dct(self, dct):
  66. """
  67. Adds a TIMEX to the DCT tag and return the DCT
  68. """
  69. pass
  70. def reconcile(self, sents):
  71. """
  72. Update this document with the newly annotated tokens.
  73. """
  74. self._sents = copy.deepcopy(sents)
  75. def _get_timex_line(self, i, j, timex):
  76. return self.docid + "\t" + str(i) + "\t" + str(j) + "\ttimex3\tt" + str(timex.id) + "\t1"
  77. def get_extents(self):
  78. """
  79. Print out the format suitable for timex-extents.tab
  80. """
  81. # TIMEXes need unique IDs
  82. all_ts = set()
  83. for sent in self._sents:
  84. for (tok, pos, ts) in sent:
  85. for t in ts:
  86. all_ts.add(t)
  87. add_timex_ids(all_ts)
  88. s = ""
  89. for i in range(len(self._sents)):
  90. for j in range(len(self._sents[i])):
  91. for timex in self._sents[i][j][2]:
  92. s += self._get_timex_line(i, j, timex) + "\n"
  93. return s
  94. def get_attrs(self):
  95. """
  96. Print out the format suitable for timex-attributes.tab
  97. """
  98. s = ''
  99. timexes_done = set()
  100. for i in range(len(self._sents)):
  101. for j in range(len(self._sents[i])):
  102. for timex in self._sents[i][j][2]:
  103. # Only need to print attributes once
  104. if timex in timexes_done:
  105. continue
  106. else:
  107. timexes_done.add(timex)
  108. if timex.value is not None:
  109. s += self._get_timex_line(i, j, timex) + "\tvalue\t" + timex.value + "\n"
  110. if timex.mod is not None:
  111. s += self._get_timex_line(i, j, timex) + "\tmod\t" + timex.mod + "\n"
  112. if timex.type is not None:
  113. s += self._get_timex_line(i, j, timex) + "\ttype\t" + timex.type.upper() + "\n"
  114. if timex.freq is not None:
  115. s += self._get_timex_line(i, j, timex) + "\tfreq\t" + timex.freq + "\n"
  116. if timex.comment is not None:
  117. s += self._get_timex_line(i, j, timex) + "\tcomment\t" + timex.comment + "\n"
  118. if timex.quant is not None:
  119. s += self._get_timex_line(i, j, timex) + "\tquant\t" + timex.quant + "\n"
  120. if timex.temporal_function:
  121. s += self._get_timex_line(i, j, timex) + "\ttemporalFunction\ttrue\n"
  122. if timex.document_role is not None:
  123. s += self._get_timex_line(i, j, timex) + "\tfunctionInDocument\t" + timex.document_role + "\n"
  124. if timex.begin_timex is not None:
  125. s += self._get_timex_line(i, j, timex) + "\tbeginPoint\tt" + str(timex.begin_timex.id) + "\n"
  126. if timex.end_timex is not None:
  127. s += self._get_timex_line(i, j, timex) + "\tendPoint\tt" + str(timex.end_timex.id) + "\n"
  128. if timex.context is not None:
  129. s += self._get_timex_line(i, j, timex) + "\tanchorTimeID\tt" + str(timex.context.id) + "\n"
  130. return s