PageRenderTime 49ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/nltk_contrib/nltk_contrib/textgrid.py

http://nltk.googlecode.com/
Python | 653 lines | 562 code | 5 blank | 86 comment | 1 complexity | 37522ac4b7b58e12160790e6d88560db MD5 | raw file
Possible License(s): Apache-2.0, AGPL-1.0
  1. # Natural Language Toolkit: TextGrid analysis
  2. #
  3. # Copyright (C) 2001-2011 NLTK Project
  4. # Author: Margaret Mitchell <itallow@gmail.com>
  5. # Steven Bird <sb@csse.unimelb.edu.au> (revisions)
  6. # URL: <http://www.nltk.org>
  7. # For license information, see LICENSE.TXT
  8. #
  9. """
  10. Tools for reading TextGrid files, the format used by Praat.
  11. Module contents
  12. ===============
  13. The textgrid corpus reader provides 5 data items and 2 functions
  14. for each textgrid file. For each tier in the file, the reader
  15. provides 9 data items and 2 functions.
  16. For the full textgrid file:
  17. - size
  18. The number of tiers in the file.
  19. - xmin
  20. First marked time of the file.
  21. - xmax
  22. Last marked time of the file.
  23. - t_time
  24. xmax - xmin.
  25. - text_type
  26. The style of TextGrid format:
  27. - ooTextFile: Organized by tier.
  28. - ChronTextFile: Organized by time.
  29. - OldooTextFile: Similar to ooTextFile.
  30. - to_chron()
  31. Convert given file to a ChronTextFile format.
  32. - to_oo()
  33. Convert given file to an ooTextFile format.
  34. For each tier:
  35. - text_type
  36. The style of TextGrid format, as above.
  37. - classid
  38. The style of transcription on this tier:
  39. - IntervalTier: Transcription is marked as intervals.
  40. - TextTier: Transcription is marked as single points.
  41. - nameid
  42. The name of the tier.
  43. - xmin
  44. First marked time of the tier.
  45. - xmax
  46. Last marked time of the tier.
  47. - size
  48. Number of entries in the tier.
  49. - transcript
  50. The raw transcript for the tier.
  51. - simple_transcript
  52. The transcript formatted as a list of tuples: (time1, time2, utterance).
  53. - tier_info
  54. List of (classid, nameid, xmin, xmax, size, transcript).
  55. - min_max()
  56. A tuple of (xmin, xmax).
  57. - time(non_speech_marker)
  58. Returns the utterance time of a given tier.
  59. Excludes entries that begin with a non-speech marker.
  60. """
  61. # needs more cleanup, subclassing, epydoc docstrings
  62. import sys
  63. import re
  64. TEXTTIER = "TextTier"
  65. INTERVALTIER = "IntervalTier"
  66. OOTEXTFILE = re.compile(r"""(?x)
  67. xmin\ =\ (.*)[\r\n]+
  68. xmax\ =\ (.*)[\r\n]+
  69. [\s\S]+?size\ =\ (.*)[\r\n]+
  70. """)
  71. CHRONTEXTFILE = re.compile(r"""(?x)
  72. [\r\n]+(\S+)\
  73. (\S+)\ +!\ Time\ domain.\ *[\r\n]+
  74. (\S+)\ +!\ Number\ of\ tiers.\ *[\r\n]+"
  75. """)
  76. OLDOOTEXTFILE = re.compile(r"""(?x)
  77. [\r\n]+(\S+)
  78. [\r\n]+(\S+)
  79. [\r\n]+.+[\r\n]+(\S+)
  80. """)
  81. #################################################################
  82. # TextGrid Class
  83. #################################################################
  84. class TextGrid(object):
  85. """
  86. Class to manipulate the TextGrid format used by Praat.
  87. Separates each tier within this file into its own Tier
  88. object. Each TextGrid object has
  89. a number of tiers (size), xmin, xmax, a text type to help
  90. with the different styles of TextGrid format, and tiers with their
  91. own attributes.
  92. """
  93. def __init__(self, read_file):
  94. """
  95. Takes open read file as input, initializes attributes
  96. of the TextGrid file.
  97. @type read_file: An open TextGrid file, mode "r".
  98. @param size: Number of tiers.
  99. @param xmin: xmin.
  100. @param xmax: xmax.
  101. @param t_time: Total time of TextGrid file.
  102. @param text_type: TextGrid format.
  103. @type tiers: A list of tier objects.
  104. """
  105. self.read_file = read_file
  106. self.size = 0
  107. self.xmin = 0
  108. self.xmax = 0
  109. self.t_time = 0
  110. self.text_type = self._check_type()
  111. self.tiers = self._find_tiers()
  112. def __iter__(self):
  113. for tier in self.tiers:
  114. yield tier
  115. def next(self):
  116. if self.idx == (self.size - 1):
  117. raise StopIteration
  118. self.idx += 1
  119. return self.tiers[self.idx]
  120. @staticmethod
  121. def load(file):
  122. """
  123. @param file: a file in TextGrid format
  124. """
  125. return TextGrid(open(file).read())
  126. def _load_tiers(self, header):
  127. """
  128. Iterates over each tier and grabs tier information.
  129. """
  130. tiers = []
  131. if self.text_type == "ChronTextFile":
  132. m = re.compile(header)
  133. tier_headers = m.findall(self.read_file)
  134. tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\""
  135. for i in range(0, self.size):
  136. tier_info = [tier_headers[i]] + \
  137. re.findall(str(i + 1) + tier_re, self.read_file)
  138. tier_info = "\n".join(tier_info)
  139. tiers.append(Tier(tier_info, self.text_type, self.t_time))
  140. return tiers
  141. tier_re = header + "[\s\S]+?(?=" + header + "|$$)"
  142. m = re.compile(tier_re)
  143. tier_iter = m.finditer(self.read_file)
  144. for iterator in tier_iter:
  145. (begin, end) = iterator.span()
  146. tier_info = self.read_file[begin:end]
  147. tiers.append(Tier(tier_info, self.text_type, self.t_time))
  148. return tiers
  149. def _check_type(self):
  150. """
  151. Figures out the TextGrid format.
  152. """
  153. m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file)
  154. try:
  155. type_id = m.group(1).strip()
  156. except AttributeError:
  157. raise TypeError("Cannot read file -- try TextGrid.load()")
  158. xmin = m.group(4)
  159. if type_id == "File type = \"ooTextFile\"":
  160. if "xmin" not in xmin:
  161. text_type = "OldooTextFile"
  162. else:
  163. text_type = "ooTextFile"
  164. elif type_id == "\"Praat chronological TextGrid text file\"":
  165. text_type = "ChronTextFile"
  166. else:
  167. raise TypeError("Unknown format '(%s)'", (type_id))
  168. return text_type
  169. def _find_tiers(self):
  170. """
  171. Splits the textgrid file into substrings corresponding to tiers.
  172. """
  173. if self.text_type == "ooTextFile":
  174. m = OOTEXTFILE
  175. header = " +item \["
  176. elif self.text_type == "ChronTextFile":
  177. m = CHRONTEXTFILE
  178. header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*"
  179. elif self.text_type == "OldooTextFile":
  180. m = OLDOOTEXTFILE
  181. header = "\".*\"[\r\n]+\".*\""
  182. file_info = m.findall(self.read_file)[0]
  183. self.xmin = float(file_info[0])
  184. self.xmax = float(file_info[1])
  185. self.t_time = self.xmax - self.xmin
  186. self.size = int(file_info[2])
  187. tiers = self._load_tiers(header)
  188. return tiers
  189. def to_chron(self):
  190. """
  191. @return: String in Chronological TextGrid file format.
  192. """
  193. chron_file = ""
  194. chron_file += "\"Praat chronological TextGrid text file\"\n"
  195. chron_file += str(self.xmin) + " " + str(self.xmax)
  196. chron_file += " ! Time domain.\n"
  197. chron_file += str(self.size) + " ! Number of tiers.\n"
  198. for tier in self.tiers:
  199. idx = (self.tiers.index(tier)) + 1
  200. tier_header = "\"" + tier.classid + "\" \"" \
  201. + tier.nameid + "\" " + str(tier.xmin) \
  202. + " " + str(tier.xmax)
  203. chron_file += tier_header + "\n"
  204. transcript = tier.simple_transcript
  205. for (xmin, xmax, utt) in transcript:
  206. chron_file += str(idx) + " " + str(xmin)
  207. chron_file += " " + str(xmax) +"\n"
  208. chron_file += "\"" + utt + "\"\n"
  209. return chron_file
  210. def to_oo(self):
  211. """
  212. @return: A string in OoTextGrid file format.
  213. """
  214. oo_file = ""
  215. oo_file += "File type = \"ooTextFile\"\n"
  216. oo_file += "Object class = \"TextGrid\"\n\n"
  217. oo_file += "xmin = ", self.xmin, "\n"
  218. oo_file += "xmax = ", self.xmax, "\n"
  219. oo_file += "tiers? <exists>\n"
  220. oo_file += "size = ", self.size, "\n"
  221. oo_file += "item []:\n"
  222. for i in range(len(self.tiers)):
  223. oo_file += "%4s%s [%s]" % ("", "item", i + 1)
  224. _curr_tier = self.tiers[i]
  225. for (x, y) in _curr_tier.header:
  226. oo_file += "%8s%s = \"%s\"" % ("", x, y)
  227. if _curr_tier.classid != TEXTTIER:
  228. for (xmin, xmax, text) in _curr_tier.simple_transcript:
  229. oo_file += "%12s%s = %s" % ("", "xmin", xmin)
  230. oo_file += "%12s%s = %s" % ("", "xmax", xmax)
  231. oo_file += "%12s%s = \"%s\"" % ("", "text", text)
  232. else:
  233. for (time, mark) in _curr_tier.simple_transcript:
  234. oo_file += "%12s%s = %s" % ("", "time", time)
  235. oo_file += "%12s%s = %s" % ("", "mark", mark)
  236. return oo_file
  237. #################################################################
  238. # Tier Class
  239. #################################################################
  240. class Tier(object):
  241. """
  242. A container for each tier.
  243. """
  244. def __init__(self, tier, text_type, t_time):
  245. """
  246. Initializes attributes of the tier: class, name, xmin, xmax
  247. size, transcript, total time.
  248. Utilizes text_type to guide how to parse the file.
  249. @type tier: a tier object; single item in the TextGrid list.
  250. @param text_type: TextGrid format
  251. @param t_time: Total time of TextGrid file.
  252. @param classid: Type of tier (point or interval).
  253. @param nameid: Name of tier.
  254. @param xmin: xmin of the tier.
  255. @param xmax: xmax of the tier.
  256. @param size: Number of entries in the tier
  257. @param transcript: The raw transcript for the tier.
  258. """
  259. self.tier = tier
  260. self.text_type = text_type
  261. self.t_time = t_time
  262. self.classid = ""
  263. self.nameid = ""
  264. self.xmin = 0
  265. self.xmax = 0
  266. self.size = 0
  267. self.transcript = ""
  268. self.tier_info = ""
  269. self._make_info()
  270. self.simple_transcript = self.make_simple_transcript()
  271. if self.classid != TEXTTIER:
  272. self.mark_type = "intervals"
  273. else:
  274. self.mark_type = "points"
  275. self.header = [("class", self.classid), ("name", self.nameid), \
  276. ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)]
  277. def __iter__(self):
  278. return self
  279. def _make_info(self):
  280. """
  281. Figures out most attributes of the tier object:
  282. class, name, xmin, xmax, transcript.
  283. """
  284. trans = "([\S\s]*)"
  285. if self.text_type == "ChronTextFile":
  286. classid = "\"(.*)\" +"
  287. nameid = "\"(.*)\" +"
  288. xmin = "(\d+\.?\d*) +"
  289. xmax = "(\d+\.?\d*) *[\r\n]+"
  290. # No size values are given in the Chronological Text File format.
  291. self.size = None
  292. size = ""
  293. elif self.text_type == "ooTextFile":
  294. classid = " +class = \"(.*)\" *[\r\n]+"
  295. nameid = " +name = \"(.*)\" *[\r\n]+"
  296. xmin = " +xmin = (\d+\.?\d*) *[\r\n]+"
  297. xmax = " +xmax = (\d+\.?\d*) *[\r\n]+"
  298. size = " +\S+: size = (\d+) *[\r\n]+"
  299. elif self.text_type == "OldooTextFile":
  300. classid = "\"(.*)\" *[\r\n]+"
  301. nameid = "\"(.*)\" *[\r\n]+"
  302. xmin = "(\d+\.?\d*) *[\r\n]+"
  303. xmax = "(\d+\.?\d*) *[\r\n]+"
  304. size = "(\d+) *[\r\n]+"
  305. m = re.compile(classid + nameid + xmin + xmax + size + trans)
  306. self.tier_info = m.findall(self.tier)[0]
  307. self.classid = self.tier_info[0]
  308. self.nameid = self.tier_info[1]
  309. self.xmin = float(self.tier_info[2])
  310. self.xmax = float(self.tier_info[3])
  311. if self.size != None:
  312. self.size = int(self.tier_info[4])
  313. self.transcript = self.tier_info[-1]
  314. def make_simple_transcript(self):
  315. """
  316. @return: Transcript of the tier, in form [(start_time end_time label)]
  317. """
  318. if self.text_type == "ChronTextFile":
  319. trans_head = ""
  320. trans_xmin = " (\S+)"
  321. trans_xmax = " (\S+)[\r\n]+"
  322. trans_text = "\"([\S\s]*?)\""
  323. elif self.text_type == "ooTextFile":
  324. trans_head = " +\S+ \[\d+\]: *[\r\n]+"
  325. trans_xmin = " +\S+ = (\S+) *[\r\n]+"
  326. trans_xmax = " +\S+ = (\S+) *[\r\n]+"
  327. trans_text = " +\S+ = \"([^\"]*?)\""
  328. elif self.text_type == "OldooTextFile":
  329. trans_head = ""
  330. trans_xmin = "(.*)[\r\n]+"
  331. trans_xmax = "(.*)[\r\n]+"
  332. trans_text = "\"([\S\s]*?)\""
  333. if self.classid == TEXTTIER:
  334. trans_xmin = ""
  335. trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text)
  336. self.simple_transcript = trans_m.findall(self.transcript)
  337. return self.simple_transcript
  338. def transcript(self):
  339. """
  340. @return: Transcript of the tier, as it appears in the file.
  341. """
  342. return self.transcript
  343. def time(self, non_speech_char="."):
  344. """
  345. @return: Utterance time of a given tier.
  346. Screens out entries that begin with a non-speech marker.
  347. """
  348. total = 0.0
  349. if self.classid != TEXTTIER:
  350. for (time1, time2, utt) in self.simple_transcript:
  351. utt = utt.strip()
  352. if utt and not utt[0] == ".":
  353. total += (float(time2) - float(time1))
  354. return total
  355. def tier_name(self):
  356. """
  357. @return: Tier name of a given tier.
  358. """
  359. return self.nameid
  360. def classid(self):
  361. """
  362. @return: Type of transcription on tier.
  363. """
  364. return self.classid
  365. def min_max(self):
  366. """
  367. @return: (xmin, xmax) tuple for a given tier.
  368. """
  369. return (self.xmin, self.xmax)
  370. def __repr__(self):
  371. return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time)
  372. def __str__(self):
  373. return self.__repr__() + "\n " + "\n ".join(" ".join(row) for row in self.simple_transcript)
  374. def demo_TextGrid(demo_data):
  375. print "** Demo of the TextGrid class. **"
  376. fid = TextGrid(demo_data)
  377. print "Tiers:", fid.size
  378. for i, tier in enumerate(fid):
  379. print "\n***"
  380. print "Tier:", i + 1
  381. print tier
  382. def demo():
  383. # Each demo demonstrates different TextGrid formats.
  384. print "Format 1"
  385. demo_TextGrid(demo_data1)
  386. print "\nFormat 2"
  387. demo_TextGrid(demo_data2)
  388. print "\nFormat 3"
  389. demo_TextGrid(demo_data3)
  390. demo_data1 = """File type = "ooTextFile"
  391. Object class = "TextGrid"
  392. xmin = 0
  393. xmax = 2045.144149659864
  394. tiers? <exists>
  395. size = 3
  396. item []:
  397. item [1]:
  398. class = "IntervalTier"
  399. name = "utterances"
  400. xmin = 0
  401. xmax = 2045.144149659864
  402. intervals: size = 5
  403. intervals [1]:
  404. xmin = 0
  405. xmax = 2041.4217474125382
  406. text = ""
  407. intervals [2]:
  408. xmin = 2041.4217474125382
  409. xmax = 2041.968276643991
  410. text = "this"
  411. intervals [3]:
  412. xmin = 2041.968276643991
  413. xmax = 2042.5281632653062
  414. text = "is"
  415. intervals [4]:
  416. xmin = 2042.5281632653062
  417. xmax = 2044.0487352585324
  418. text = "a"
  419. intervals [5]:
  420. xmin = 2044.0487352585324
  421. xmax = 2045.144149659864
  422. text = "demo"
  423. item [2]:
  424. class = "TextTier"
  425. name = "notes"
  426. xmin = 0
  427. xmax = 2045.144149659864
  428. points: size = 3
  429. points [1]:
  430. time = 2041.4217474125382
  431. mark = ".begin_demo"
  432. points [2]:
  433. time = 2043.8338291031832
  434. mark = "voice gets quiet here"
  435. points [3]:
  436. time = 2045.144149659864
  437. mark = ".end_demo"
  438. item [3]:
  439. class = "IntervalTier"
  440. name = "phones"
  441. xmin = 0
  442. xmax = 2045.144149659864
  443. intervals: size = 12
  444. intervals [1]:
  445. xmin = 0
  446. xmax = 2041.4217474125382
  447. text = ""
  448. intervals [2]:
  449. xmin = 2041.4217474125382
  450. xmax = 2041.5438290324326
  451. text = "D"
  452. intervals [3]:
  453. xmin = 2041.5438290324326
  454. xmax = 2041.7321032910372
  455. text = "I"
  456. intervals [4]:
  457. xmin = 2041.7321032910372
  458. xmax = 2041.968276643991
  459. text = "s"
  460. intervals [5]:
  461. xmin = 2041.968276643991
  462. xmax = 2042.232189031843
  463. text = "I"
  464. intervals [6]:
  465. xmin = 2042.232189031843
  466. xmax = 2042.5281632653062
  467. text = "z"
  468. intervals [7]:
  469. xmin = 2042.5281632653062
  470. xmax = 2044.0487352585324
  471. text = "eI"
  472. intervals [8]:
  473. xmin = 2044.0487352585324
  474. xmax = 2044.2487352585324
  475. text = "dc"
  476. intervals [9]:
  477. xmin = 2044.2487352585324
  478. xmax = 2044.3102321849011
  479. text = "d"
  480. intervals [10]:
  481. xmin = 2044.3102321849011
  482. xmax = 2044.5748932104329
  483. text = "E"
  484. intervals [11]:
  485. xmin = 2044.5748932104329
  486. xmax = 2044.8329108578437
  487. text = "m"
  488. intervals [12]:
  489. xmin = 2044.8329108578437
  490. xmax = 2045.144149659864
  491. text = "oU"
  492. """
# Sample file in the short layout: the same "ooTextFile" marker line, but
# the values carry no labels.  TextGrid._check_type reports it as
# "OldooTextFile" because its fourth line does not contain "xmin".
demo_data2 = """File type = "ooTextFile"
Object class = "TextGrid"
0
2.8
<exists>
2
"IntervalTier"
"utterances"
0
2.8
3
0
1.6229213249309031
""
1.6229213249309031
2.341428074708195
"demo"
2.341428074708195
2.8
""
"IntervalTier"
"phones"
0
2.8
6
0
1.6229213249309031
""
1.6229213249309031
1.6428291382019483
"dc"
1.6428291382019483
1.65372183721983721
"d"
1.65372183721983721
1.94372874328943728
"E"
1.94372874328943728
2.13821938291038210
"m"
2.13821938291038210
2.341428074708195
"oU"
2.341428074708195
2.8
""
"""
  540. demo_data3 = """"Praat chronological TextGrid text file"
  541. 0 2.8 ! Time domain.
  542. 2 ! Number of tiers.
  543. "IntervalTier" "utterances" 0 2.8
  544. "IntervalTier" "utterances" 0 2.8
  545. 1 0 1.6229213249309031
  546. ""
  547. 2 0 1.6229213249309031
  548. ""
  549. 2 1.6229213249309031 1.6428291382019483
  550. "dc"
  551. 2 1.6428291382019483 1.65372183721983721
  552. "d"
  553. 2 1.65372183721983721 1.94372874328943728
  554. "E"
  555. 2 1.94372874328943728 2.13821938291038210
  556. "m"
  557. 2 2.13821938291038210 2.341428074708195
  558. "oU"
  559. 1 1.6229213249309031 2.341428074708195
  560. "demo"
  561. 1 2.341428074708195 2.8
  562. ""
  563. 2 2.341428074708195 2.8
  564. ""
  565. """
# Run the demonstration when the module is executed as a script.
if __name__ == "__main__":
    demo()