
/cing/python/cing/STAR/Text.py

http://cing.googlecode.com/
"""
Classes for dealing with STAR syntax
"""
from cing import verbosity
import re

__author__ = "$Author: jurgenfd $"
___revision__ = "$Revision: 1050 $"
___date__ = "$Date: 2011-08-11 11:57:27 +0200 (Thu, 11 Aug 2011) $"
# Some handy patterns and functions for dealing with text in the STAR syntax.
# Some are complicated because in Python the non-greedy pattern matching
# gets too recursive and will actually bomb on larger strings. For example the
# following code causes a bomb:
# re.search( 'a.*?c', 'a' + 99999*'b' + 'c' )
# Produces: 'RuntimeError: maximum recursion limit exceeded'
## When not sure if text can have a ; at the start of a line, use
## this string prepended to each line.
prepending_string = '[raw] '
NULL_STRING_DOT = '.'
FREE = 0
SINGLE = 1
DOUBLE = 2
singleq = "'"
doubleq = '"'
sharp = '#'
space = ' '
## The following string will replace the eol in a semicolon block where needed.
## It may not contain any funny characters and shouldn't have underscores
## because they would make parsing slower. Parentheses, if used, should be of the
## square type.
eol_string = '<eol-string>'
eol_string_length = len(eol_string)
# Redefined below; this duplicate definition was curiously found by code analysis from the PyDev
# extensions when changing the wild import to a specific import. That sounds like bad Python if it matters.
#pattern_tagtable_loop = re.compile(r"""
#    ^\s* loop_ \s*                       # Begin of loop
#    ( ^\s* (?P<tagname>_\S+) \s*\n )+    # Tag names with some spaces
#    (?P<rawtext>.+?)                     # Tag table raw text
#    ^\s* stop_ \s*\n                     # End of loop
#    """, re.DOTALL | re.MULTILINE | re.VERBOSE )
pattern_semicolon_block = re.compile(r"""
    ^;          # semicolon at begin, any text and then eol
    .+?         # Raw text for match object but not greedy
    ^;          # semicolon at begin, that's it
    """, re.DOTALL | re.MULTILINE | re.VERBOSE )
pattern_eol_string = re.compile( eol_string, re.MULTILINE )
## The next pattern tells, when searching over ONE tag value, whether it needs quotes.
pattern_quotes_needed = re.compile( r'[\s\'\"]|^_|^\#' )
## The next pattern tells, when searching over MANY tag values, whether they need quotes.
## The values should be joined by a comma. A value like 'bla,_bla' will be
## marked as needing quotes unnecessarily, but that's dealt with in the code by further checking.
pattern_quotes_needed_2 = re.compile( r'[\s\'\"]|^_|,_|,\#' )
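## Illustrative sketch (added here, not in the original module): a plain atom
## name needs no quoting, but a value holding white space or a quote character does.
## >>> bool( pattern_quotes_needed.search( 'HB2' ) )
## False
## >>> bool( pattern_quotes_needed.search( "H1'" ) )
## True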
pattern_eoline_etcet = re.compile( r'[\n\r\v\f]' )
# If the quote character is at the end of the word then it is falsely considered to need a
# different quote style; this happens frequently for e.g. H1' and all nucleic acid sugar atoms.
pattern_single_qoute = re.compile( r"'" )
pattern_double_qoute = re.compile( r'"' )
pattern_save_begin = re.compile('save_(\S+)\s+')
pattern_save_end = re.compile('save_\s*')
pattern_tagtable_loop = re.compile("loop_\s*" )
pattern_tagtable_stop = re.compile("stop_\s*" )
# Same thing but not eating all white space chars, just a minimal match
pattern_save_begin_nws = re.compile('save_\S')
# Pattern extended to include matches to "save_" as the last characters in a file,
# in other words: without an end of line.
pattern_save_end_nws = re.compile('(?:save_\s)|(?:save_$)')
#pattern_save_end_nws = re.compile('save_\s')
pattern_tagtable_loop_nws = re.compile('loop_\s')
pattern_tag_name_nws = re.compile('_\S')
# Same thing but requiring a prefixed white space char:
##pattern_sf_begin_or_end = re.compile('\ssave_')
pattern_tagtable_loop_2 = re.compile('\sloop_\s+' )
pattern_tagtable_stop_2 = re.compile('\sstop_\s+' )
pattern_tagname_2 = re.compile('\s_\S+\s+' )
pattern_tag_name = re.compile(r"""(_\S+) \s+
    """, re.DOTALL | re.MULTILINE | re.VERBOSE )
pattern_tags_loop = re.compile(r"""(?: (_\S+) \s* )+
    """, re.MULTILINE | re.VERBOSE )
pattern_tags_loop_2 = re.compile(r""" (_\S+) \s*
    """, re.MULTILINE | re.VERBOSE )
## Get any number of non-white space characters followed by any white space
pattern_word = re.compile(r"""(\S+)\s*""", re.MULTILINE )
pattern_quoted = re.compile(r"""
    ['"]      |    # single or double quote
    (?: ^ ; )      # semicolon at the beginning of a line
    """, re.MULTILINE | re.VERBOSE )
pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^ \; )""", re.MULTILINE | re.VERBOSE )
pattern_s_quote = re.compile(r"""\'\s+""", re.MULTILINE )
pattern_d_quote = re.compile(r"""\"\s+""", re.MULTILINE )
pattern_e_semicolon = re.compile( eol_string + r"""\;\s*""", re.MULTILINE ) # Added \n for better parsing Wim 01/11/05
# Set beginning of line BEFORE whitespace - Wim 06/03/2003
#pattern_comment_begin = re.compile (r"""^\s*\#.*\n    # A string starting a line with a sharp
#    """, re.MULTILINE | re.VERBOSE)
pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\}
    """, re.MULTILINE | re.VERBOSE)
pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\}
    """, re.MULTILINE | re.VERBOSE)
# JFD old's:
#pattern_comment_middle = re.compile (r"""(^[^;^\n] .*? )    # Any string beginning a line other than with a semicolon
#    (\s \# .* $ )                                           # Any string ending a line and starting with a sharp
#    """, re.MULTILINE | re.VERBOSE)
# Wim's:
#pattern_comment_middle = re.compile (
#    r""" (                              # start group 1 that will be captured for replay.
#        ^[^;^\n]                        # not a what?
#        (?:                             # start a non-capturing group
#            (                           # start group 2 (capturing?)
#                [\'][^\']*\#[^\']*[\'] |    # get '<text>#<text>'
#                [\"][^\"]*\#[^\"]*[\"]      # get "<text>#<text>"
#            ) |
#            [^\#.]
#        )*?
#    )
#                                        # Any string beginning a line other than with a semicolon and with no quotes in it
#    (\s+\#.*)? $                        # the comment to be deleted.
#                                        # Any string ending a line and starting with a sharp
#    """, re.MULTILINE | re.VERBOSE)
#    # Hashes in quotes don't count!
#    # (?:[\'\"][^\'^\".]*\#[^\'^\".]*[\'\"]|[^\#.])*? ) expression gets '<text>#<text>' blocks,
#    # is now built into multiline search, seems to be working... (Wim 11/02)
#    # Changed \s* to \s+ - comments can only start with a ' ' before the '#' (Wim 05/03)
#    # Removed . from [^\'^\".] in regular expression described above: more generic (Wim 05/03)
# doesn't catch """H# # comment""" see testcomments_strip3a
# doesn't catch """
#;
#foo # comment
#;"""
def pattern_unquoted_find(text, pattern, pos=0):
    """
    Searches for a regular expression in text.
    The text may not be STAR quoted and must have semicolon blocks collapsed
    such that the semicolon starts at the beginning of the line.
    Returns the start position of the match or -1 if it was not found or
    None if there was an error.
    The function will search the text from the given position onwards
    and checks the chars preceding the match (up to the line it's in) for quote style.
    WARNINGS:
    - Don't call it for a text that has no \n and at least 1 other
      character in it before pos (not fully tested; perhaps possible).
    - I have not put in extra checks because of the speed needed.
    - No requirements are set on what follows the pattern.
    """
    while 1:
        match = pattern.search( text, pos)
        if not match:
            ## No match at all
            return -1
        pos = match.start()
        ## Is it the beginning of the string
        if pos == 0:
            return 0
        ## Is the first character matched an eol itself
        if text[pos] == '\n':
            if verbosity >= 9:
                print 'Found pattern: [%s] at the beginning of a line' % pattern.pattern
            return pos
        ## I hope the rfind is optimized to stroll backwards from pos
        pos_end_of_previous_line = text.rfind('\n', 0, pos)
        if pos_end_of_previous_line == -1:
            pos_end_of_previous_line = -1 ## Dangerous rewind?
        line = text[pos_end_of_previous_line+1:pos]
        # Some dummy value but continue with the test below.
        if line == '':
            line = ' '
        # Not the one
        if line[0] == ';':
            if verbosity > 1:
                print 'WARNING: (1) found pattern: [%s] preceded by: [%s]' % (
                    pattern.pattern, line )
            pos = pos + 1
            continue
        squoted = None
        dquoted = None
        for i in line:
            if i == "'":
                if not dquoted:
                    squoted = not squoted
            elif i == '"':
                if not squoted:
                    dquoted = not dquoted
        if squoted or dquoted:
##            if squoted and dquoted:
##                ## Should not be possible to occur, delete when confident
##                print "ERROR: code error, mixing of quote styles in line:"
##                print "ERROR: [%s]" % line
##                return None
            if verbosity > 1:
                print 'WARNING: (2) found pattern: [%s] preceded by: [%s]' % (
                    pattern.pattern, line )
            # Not the one
            pos = pos + 1
            continue
        return pos
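## Illustrative sketch (added, not in the original module): the first 'loop_' sits
## inside single quotes and is skipped (a warning may be printed when verbosity > 1);
## only the unquoted 'loop_' starting the second line is reported.
## >>> pattern_unquoted_find( "_tag 'loop_ inside'\nloop_\n", pattern_tagtable_loop_nws )
## 20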
def tag_value_quoted_parse( text, pos ):
    """
    Parse one quoted tag value beginning from position: pos
    Return the value and the position of the 'cursor' behind the
    value for the first non white space char.
    In case of error the position value of None will signal failure.
    """
#    print 'text: [%s]' % text[pos:pos+80]
#    print 'pos: [%s]' % pos
    if text[ pos ] == '"':
        match_d_quote = pattern_d_quote.search( text, pos+1)
        if not match_d_quote:
            print "ERROR: No matching double quote char found for double quote char at offset:", 0
            print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
            return None, None
##        if verbosity >= 9:
##            print "pos, span():", pos, match_d_quote.span()
##            print 'Found Q tag value: [%s]' % text[ pos+1:match_d_quote.start() ]
        return text[ pos+1:match_d_quote.start() ], match_d_quote.end()
    if text[ pos ] == "'":
        match_s_quote = pattern_s_quote.search( text, pos+1)
        if not match_s_quote:
            print "ERROR: No matching single quote char found for single quote char at offset:", 0
            print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
            return None, None
        value = text[ pos+1:match_s_quote.start() ]
##        if verbosity >= 9:
##            print "pos, span():", pos, match_s_quote.span()
##            print 'Found Q tag value: [%s]' % value
        return value, match_s_quote.end()
    ## Remove check for speed if you want
    ## This should always be true
    if text[ pos ] == ";":
        match_e_semicolon = pattern_e_semicolon.search( text, pos+1)
        if not match_e_semicolon:
            print "ERROR: No matching semicolon found for semicolon char at offset:", 0
            print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
            return None, None
##        print "pos, span():", pos, match_e_semicolon.span()
        ## Include the first eol and the eol before the semicolon
        value = text[ pos+1:match_e_semicolon.start()+eol_string_length ]
        ## Expansion relatively cheap here and harmless if unique string as defined in
        ## eol_string is indeed unique
##        print 'Found Q (semicolon) tag value: unexpanded [%s]' % value
##        print '-----------'
##        print text[ match_e_semicolon.start()+eol_string_length : match_e_semicolon.start()+eol_string_length + 20]
##        print '-----------'
        value = semicolon_block_expand( value )
##        print 'Found Q (semicolon) tag value: expanded [%s]' % value
        return value, match_e_semicolon.end()
    print "ERROR: Position in text:", pos
    print """ERROR: should contain a ', ", or a ; but was not found:"""
    print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
    return None, None
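## Illustrative sketch (added, not in the original module): parse a single-quoted
## value; the returned position points past the closing quote and the space behind it.
## >>> tag_value_quoted_parse( "'a value' next", 0 )
## ('a value', 10)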
def tag_value_parse( text, pos):
    """
    From text on position pos, read a tag value and return the value and
    position of the next non-space char. This is the slow parsing method
    that should only be used for free tags.
    """
    match_quoted = pattern_quoted.search( text, pos )
    if match_quoted:
        if match_quoted.start() == pos:
            return tag_value_quoted_parse( text, pos ) # Better speed with this code
    match_word = pattern_word.search( text, pos )
    if not match_word:
        print "ERROR: No match for a 'word' at offset:", pos
        print "ERROR: Next 70 chars are:", text[ pos:pos+70 ]
        return None, None
    if match_word.start() != pos:
        print "ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos
        print "ERROR: Next 70 chars are:", text[ pos:pos+70 ]
        return None, None
    ## Include the first eol and the eol before the semicolon
    return match_word.group(1), match_word.end()
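## Illustrative sketch (added, not in the original module): an unquoted value is
## read up to the next white space.
## >>> tag_value_parse( '1.5 2.3 ', 0 )
## ('1.5', 4)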
def semicolon_block_replace( matchobj ):
    """
    See function semicolon_block_collapse that calls this one
    """
#    print len(matchobj.group())
    return re.sub( '\n', eol_string, matchobj.group() )
def semicolon_block_collapse( text ):
    """
    This function should be called (not semicolon_block_replace).
    Puts all semicolon separated values on one line
    by replacing the eol within with a unique key value
    that is to be removed later on by its sibling method:
    semicolon_block_expand.
    SPEED: 0.6 cpu seconds for a 5 Mb file with 31 blocks and
           1.3 cpu seconds for a 10 Mb file with 64 blocks.
    """
#    count = 0
    startpos = 0
    pattern_semicolon_only = re.compile("^\;", re.MULTILINE)
    # Added special _end pattern with $ for better pattern matching - Wim 31/10/2005
    pattern_semicolon_only_end = re.compile("(^\;\s*$)", re.MULTILINE)
    semicolon_start = pattern_semicolon_only.search(text[startpos:])
    while(semicolon_start):
#        count += 1
        startpos = startpos + semicolon_start.start()
        semicolon_end = pattern_semicolon_only_end.search(text[startpos+1:])
        try:
            endpos = startpos + 1 + semicolon_end.end() - len(semicolon_end.group(1)) + 1
        except:
            print "ERROR in semicolon_block_collapse for text starting at: [" + text[startpos:startpos+100] + "]"
            raise
        text_replace = re.sub("\n", eol_string, text[startpos:endpos])
        # This is bulky and not very elegant but works
        text = text[0:startpos] + text_replace + text[endpos:]
        startpos = startpos + len(text_replace)
        semicolon_start = pattern_semicolon_only.search(text[startpos:])
    # Original code: can't handle re matches that are too long
    #text, count = pattern_semicolon_block.subn( semicolon_block_replace, text )
    # nTdebug('Done [%s] subs with semicolon blocks' % count)
    return text
def semicolon_block_expand( text ):
    return pattern_eol_string.sub('\n', text )
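## Illustrative sketch (added, not in the original module): collapsing folds a
## semicolon block onto one line by replacing each eol with eol_string;
## expanding restores the original text.
## >>> collapsed = semicolon_block_collapse( 'start\n;\nline one\nline two\n;\nrest\n' )
## >>> semicolon_block_expand( collapsed ) == 'start\n;\nline one\nline two\n;\nrest\n'
## True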
def quotes_add( text ):
    """
    Adds semicolons, single quotes or double quotes depending on
    need according to STAR syntax.
    Does not assume that no quotes exist initially and will strip them if
    present in pairs only.
    If the possible_bad_char parameter is set (to 1 or higher) then
    strings that would normally end up in a semicolon delimited blob will
    have a string inserted at the beginning of it. The string can be the 'p'
    argument to this function. [TODO]
    """
    preferred_quote = '"' # This info should be in a more central spot
    if pattern_eoline_etcet.search( text ):
        return semicolons_add( text )
    if pattern_single_qoute.search( text ):
        single_qoute_match = 1
    else:
        single_qoute_match = 0
    if pattern_double_qoute.search( text ):
        double_qoute_match = 1
    else:
        double_qoute_match = 0
    if single_qoute_match and double_qoute_match:
        return semicolons_add( text )
    if single_qoute_match:
        return '"' + text + '"'
    # Commented out because it leads to the same behaviour
    if double_qoute_match:
        return "'" + text + "'"
    ## Space other than end of line, or # sign etc.
    return preferred_quote + text + preferred_quote
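## Illustrative sketch (added, not in the original module): a value containing a
## single quote is wrapped in double quotes; a multi-line value becomes a semicolon block.
## >>> quotes_add( "H1'" ) == '"' + "H1'" + '"'
## True
## >>> quotes_add( 'line one\nline two' )
## '\n;\nline one\nline two\n;\n'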
def quotes_strip( text ):
    "Strips quotes in pairs and returns new/old string"
    ## Can it be containing quotes?
    if len(text) <= 1:
        return text
    for quote_symbol in [ "\'", '\"' ]:
        if ( text[0] == quote_symbol and
             text[-1] == quote_symbol ):
            return text[1:-1]
    return text
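## Illustrative sketch (added, not in the original module): quotes are only removed
## when they occur as a matching pair at both ends.
## >>> quotes_strip( '"a value"' )
## 'a value'
## >>> quotes_strip( "don't" )
## "don't"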
def semicolons_add( text, possible_bad_char=None ):
    """
    Returns the input delimited by semicolons, possibly with a string inserted at the
    beginning of each line. The string value should always end with an eol, otherwise
    the second semicolon can not be the first char on a line.
    """
    if possible_bad_char:
        lines = text.split('\n')
        text = ''
        for line in lines:
            text = text + prepending_string + line + '\n'
##        ## Code repeated for speed
##        return "\n;" + text + ";\n"
##    else:
##        return "\n;" + text + ";\n"
    #JFD updates 5/23/2006; apparently the text does not always end with an eol.
    if not text.endswith('\n'):
        text = text + '\n'
    return "\n;\n" + text + ";\n"
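## Illustrative sketch (added, not in the original module): the value is wrapped in
## a semicolon block; with possible_bad_char set, every line gets prepending_string
## so no line of the value can start with a semicolon.
## >>> semicolons_add( 'a multi-line\nvalue' )
## '\n;\na multi-line\nvalue\n;\n'
## >>> semicolons_add( 'foo', possible_bad_char=1 )
## '\n;\n[raw] foo\n;\n'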
def comments_strip( text ):
    """
    Strip the STAR comments new style
    """
    lines = text.split( "\n" )
    i = 0
#    count = 0
    ls = len(lines)
#    print "DEBUG: processing lines: ", ls
    while i < ls:
#        print "DEBUG: processing A line: ", i
        line = lines[i]
        # Scan past semicolon blocks.
        n = len(line)
        if n < 1:
#            print "DEBUG: skipping empty line: "
            i += 1
            continue
        if line[0] == ';': # start a semicolon block
#            print "DEBUG: found start of semicolon block."
            i += 1
            line = lines[i]
#            print "DEBUG: processing B line: ", i
            while len(line) == 0 or line[0] != ';':
                i += 1
                line = lines[i]
#                print "DEBUG: processing C line: ", i
            # end a semicolon block
        else:
            line = _comments_strip_line(line)
            if len(line) != n:
                lines[i] = line
#                print "Changed from length", n, "to line: ["+line+"] at:", i
#                count += 1
        i += 1
#    if verbosity >= 9:
#        nTdebug( 'Done [%s] comment subs' % count )
    if lines:
        text = "\n".join(lines)
#        text = lines.join("\n")
    return text
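## Illustrative sketch (added, not in the original module): the free-standing comment
## is stripped while the '#' inside the semicolon block is preserved.
## >>> comments_strip( '_Tag value # a comment\n;\nkeep # this\n;\n' )
## '_Tag value \n;\nkeep # this\n;\n'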
def _comments_strip_line( line ):
    """
    Strip the STAR comments for a single line.
    """
    c = 0
    state = FREE # like to start out free which is possible after doing semicolon blocks.
    n = len(line)
    while c < n: # parse range [0,n> where n is the length and exclusive.
        ch = line[c]
#        print "DEBUG: Processing char '"+ch+"' at "+repr(c)+" in state:", state
        if ( ch == sharp and state == FREE and  # A sharp in FREE state
             (c == 0 or line[c-1].isspace())):  # behind a space or at the beginning of a line.
#            print "DEBUG: Found sharpie"
            if c == 0:
                return ''
            return line[0:c] # this is fast.
        if c == n-1: # c is the last character; leave it alone if it's not a sharpie
            return line
        if ch == doubleq:
            if (state == FREE and # new " behind space or at beginning of line
                (c == 0 or line[c-1].isspace())):
                state = DOUBLE
            elif state == DOUBLE:
                if line[c+1].isspace(): # guaranteed to exist now.
                    state = FREE
        elif ch == singleq:
            if (state == FREE and
                (c == 0 or line[c-1].isspace())):
                state = SINGLE
            elif state == SINGLE:
                if line[c+1].isspace():
                    state = FREE
        c += 1
    return line
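## Illustrative sketch (added, not in the original module): a sharp inside quotes is
## kept; only the trailing free comment is chopped off.
## >>> _comments_strip_line( "_Name 'val # ue' # comment" )
## "_Name 'val # ue' "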
def isStarNan(starValue):
    if NULL_STRING_DOT == starValue:
        return True
#    if '?' == starValue:
#        return True
    if None == starValue:
        return True
    return False
def translateStarNanToNone(starValue):
    if isStarNan(starValue):
        return None
    return starValue
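## Illustrative sketch (added, not in the original module): the STAR null value '.'
## maps to None; a regular value passes through unchanged.
## >>> translateStarNanToNone( '.' ) is None
## True
## >>> translateStarNanToNone( '1.5' )
## '1.5'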
#def comments_stripOld( text ):
#    # split for profiling
#    text = _comments_strip1(text)
#    text = _comments_strip2(text)
#    return text
#
#def _comments_strip1( text ):
#    text, count = pattern_comment_begin.subn( '', text )
#    if verbosity >= 9:
#        print 'Done [%s] subs with comment at beginning of line' % count
#    return text
#
#def _comments_strip2( text ):
#    text, count = pattern_comment_middle.subn( '\g<1>', text )
#    if verbosity >= 9:
#        print 'Done [%s] subs with comment not at beginning of line' % count
#    return text
def nmrView_compress( text ):
    text, count = pattern_nmrView_compress_empty.subn( '{}', text )
    print 'Compressed [%s] nmrView empty { } tags' % count
    text, count = pattern_nmrView_compress_questionmark.subn( '{?}', text )
    print 'Compressed [%s] nmrView question mark { ?} tags' % count
    return text
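## Illustrative sketch (added, not in the original module): white-space-only and
## question-mark nmrView fields are compressed; the counts are printed as a side effect.
## >>> nmrView_compress( '{   } {  ?} {keep me}' )   # also prints two 'Compressed' lines
## '{} {?} {keep me}'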