/cing/python/cing/STAR/Text.py
Python | 566 lines | 472 code | 19 blank | 75 comment | 20 complexity | 05ce93be4a869a8cdc3e1dcbd7a5de8e MD5 | raw file
- """
- Classes for dealing with STAR syntax
- """
- from cing import verbosity
- import re
- __author__ = "$Author: jurgenfd $"
- ___revision__ = "$Revision: 1050 $"
- ___date__ = "$Date: 2011-08-11 11:57:27 +0200 (Thu, 11 Aug 2011) $"
- #Some handy patterns and functions for dealing with text in the STAR syntax.
- #Some are complicated because in Python the none-greedy pattern matching
- #gets too recursive and will actually bomb on larger strings. Like the
- #following code causes a bomb:
- #re.search( 'a.*?c', 'a' + 99999*'b' + 'c' )
- #Produces: 'RuntimeError: maximum recursion limit exceeded'
- ## When not sure if text can have a ; at start of line use
- ## this string prepended to each line.
- prepending_string = '[raw] '
- NULL_STRING_DOT ='.'
- FREE = 0
- SINGLE = 1
- DOUBLE = 2
- singleq = "'"
- doubleq = '"'
- sharp = '#'
- space = ' '
- ## Following string will be replacing the eol in a semicolon block where needed
- ## It may not contain any funny characters and shouldn't have underscores
- ## because it will make parsing slower. Parentheses, if used, should be of the
- ## square type.
- eol_string = '<eol-string>'
- eol_string_length = len(eol_string)
- # Redefined below curiously found this bug with code analysis from pydev extensions
- # changing the wild import to specific import; that sounds like bad python if it matters.
- #pattern_tagtable_loop = re.compile(r"""
- #^\s* loop_ \s* # Begin of loop
- #( ^\s* (?P<tagname>_\S+) \s*\n )+ # Tag names with some spaces
- # (?P<rawtext>.+?) # Tag table raw text
- #^\s* stop_ \s*\n # End of loop
- # """, re.DOTALL | re.MULTILINE | re.VERBOSE )
- pattern_semicolon_block = re.compile(r"""
- ^; # semicolon at begin, any text and then eol
- .+? # Raw text for match object but not greedy
- ^; # semicolon at begin, that's it
- """, re.DOTALL | re.MULTILINE | re.VERBOSE )
- pattern_eol_string = re.compile( eol_string, re.MULTILINE )
- ## Next pattern tells when search for on ONE tagvalue if it needs quotes
- pattern_quotes_needed = re.compile( r'[\s\'\"]|^_|^\#' )
- ## Next pattern tells when search for on MANY tagvalues if it needs quotes
- ## The values should be joined by a comma. A value: 'bla,_bla' will be
- ## mentioned as needing quotes unnecessarily but that's dealt with in the code by further checking
- pattern_quotes_needed_2= re.compile( r'[\s\'\"]|^_|,_|,\#' )
- pattern_eoline_etcet = re.compile( r'[\n\r\v\f]' )
- # If the quote character is at the end of the word then it is falsely considered to need a
- # different quote style; this happens frequently for e.g. H1' and all nucleic acid sugar atoms.
- pattern_single_qoute = re.compile( r"'" )
- pattern_double_qoute = re.compile( r'"' )
- pattern_save_begin = re.compile('save_(\S+)\s+')
- pattern_save_end = re.compile('save_\s*')
- pattern_tagtable_loop = re.compile("loop_\s*" )
- pattern_tagtable_stop = re.compile("stop_\s*" )
- # Same thing but not eating all white space chars, just a minimal match
- pattern_save_begin_nws = re.compile('save_\S')
- # Pattern extended to include matches to "save_" as the last characters in a file.
- # in other words; without a end of line.
- pattern_save_end_nws = re.compile('(?:save_\s)|(?:save_$)')
- #pattern_save_end_nws = re.compile('save_\s')
- pattern_tagtable_loop_nws = re.compile('loop_\s')
- pattern_tag_name_nws = re.compile('_\S')
- # Same thing but requiring a prefixed white space char:
- ##pattern_sf_begin_or_end = re.compile('\ssave_')
- pattern_tagtable_loop_2 = re.compile('\sloop_\s+' )
- pattern_tagtable_stop_2 = re.compile('\sstop_\s+' )
- pattern_tagname_2 = re.compile('\s_\S+\s+' )
- pattern_tag_name = re.compile(r"""(_\S+) \s+
- """, re.DOTALL | re.MULTILINE | re.VERBOSE )
- pattern_tags_loop = re.compile(r"""(?: (_\S+) \s* )+
- """, re.MULTILINE | re.VERBOSE )
- pattern_tags_loop_2 = re.compile(r""" (_\S+) \s*
- """, re.MULTILINE | re.VERBOSE )
- ## Get any number of non-white space characters followed by any white space
- pattern_word = re.compile(r"""(\S+)\s*""", re.MULTILINE )
- pattern_quoted = re.compile(r"""
- ['"] | # single or double quote
- (?: ^ ; ) # semicolon at the beginning of a line
- """, re.MULTILINE | re.VERBOSE )
- pattern_quoted_2 = re.compile(r"""(?: \b [\'\"] ) | (?: ^ \; )""", re.MULTILINE | re.VERBOSE )
- pattern_s_quote = re.compile(r"""\'\s+""", re.MULTILINE )
- pattern_d_quote = re.compile(r"""\"\s+""", re.MULTILINE )
- pattern_e_semicolon = re.compile( eol_string + r"""\;\s*""", re.MULTILINE ) # Added \n for better parsing Wim 01/11/05
- # Set beginning of line BEFORE whitespace - Wim 06/03/2003
- #pattern_comment_begin = re.compile (r"""^\s*\#.*\n # A string starting a line with a sharp
- # """, re.MULTILINE | re.VERBOSE)
- pattern_nmrView_compress_empty = re.compile(r""" \{(\s+)\}
- """, re.MULTILINE | re.VERBOSE)
- pattern_nmrView_compress_questionmark = re.compile(r""" \{(\s+\?)\}
- """, re.MULTILINE | re.VERBOSE)
- # JFD old's
- #pattern_comment_middle = re.compile (r"""(^[^;^\n] .*? ) # Any string beginning a line other than with a semicolon
- # (\s \# .* $ ) # Any string ending a line and starting with a sharp
- # """, re.MULTILINE | re.VERBOSE)
- # Wim's:
- #pattern_comment_middle = re.compile (
- # r""" ( # start group 1 that will be captured for replay.
- # ^[^;^\n] # not a what?
- # (?: # start a non-capturing group
- # ( # start group 2 (capturing?)
- # [\'][^\']*\#[^\']*[\'] | # get '<text>#<text>'
- # [\"][^\"]*\#[^\"]*[\"] # get "<text>#<text>"
- # ) |
- # [^\#.]
- # )*?
- # )
- # # Any string beginning a line other than with a semicolon and with no quotes in it
- # (\s+\#.*)? $ # the comment to be deleted.
- # # Any string ending a line and starting with a sharp
- # """, re.MULTILINE | re.VERBOSE)
- # # Hashes in quotes don't count!
- # # (?:[\'\"][^\'^\".]*\#[^\'^\".]*[\'\"]|[^\#.])*? ) expression gets '<text>#<text>' blocks,
- # # is now built into multiline search, seems to be working... (Wim 11/02)
- # # Changed \s* to \s+ - comments can only start with a ' ' before the '#' (Wim 05/03)
- # # Removed . from [^\'^\".] in regular expression described above: more generic (Wim 05/03)
- # doesn't catch"""H# # comment""" see testcomments_strip3a
- # doesn't catch"""
- #;
- #foo # comment
- #;"""
- def pattern_unquoted_find(text, pattern, pos=0):
- """
- Searches for a regular expression in text.
- The text may not be STAR quoted and must have semicolon blocks collapsed
- such that the semicolon starts at the beginning of the line.
- Returns the start position of the match or -1 if it was not found or
- None if there was an error.
-
- The function will search the text from given position onwards
- and checks the chars preceding (up to the line it's in) for quote style.
-
- WARNINGS:
- - Don't call it for a text that has no \n and at least 1 other
- character in it before pos (not fully tested; perhaps possible).
- - I have not put in extra checks because of needed speed.
- - No requirements set on what follows the pattern.
- """
- while 1:
- match = pattern.search( text, pos)
- if not match:
- ## No match at all
- return -1
- pos = match.start()
- ## Is it the beginning of the string
- if pos == 0:
- return 0
- ## Is the first character matched an eol it self
- if text[pos]=='\n':
- if verbosity >= 9:
- print 'Found pattern: [%s] at the beginning of a line' % pattern.pattern
- return pos
- ## I hope the rfind is optimized to stroll backwards from pos
- pos_end_of_previous_line = text.rfind('\n', 0, pos)
- if pos_end_of_previous_line == -1:
- pos_end_of_previous_line = -1 ## Dangerous rewind?
- line = text[pos_end_of_previous_line+1:pos]
- # Some dummy value but continue with the test below.
- if line == '':
- line = ' '
- # Not the one
- if line[0] == ';':
- if verbosity > 1:
- print 'WARNING: (1) found pattern: [%s] preceded by: [%s]' % (
- pattern.pattern, line )
- pos = pos + 1
- continue
- squoted = None
- dquoted = None
- for i in line:
- if i == "'":
- if not dquoted:
- squoted = not squoted
- elif i == '"':
- if not squoted:
- dquoted = not dquoted
- if squoted or dquoted:
- ## if squoted and dquoted:
- ## ## Should not be possible to occur, delete when confident
- ## print "ERROR: code error, mixing of quote styles in line:"
- ## print "ERROR: [%s]" % line
- ## return None
- if verbosity > 1:
- print 'WARNING: (2) found pattern: [%s] preceded by: [%s]' % (
- pattern.pattern, line )
- # Not the one
- pos = pos + 1
- continue
- return pos
- def tag_value_quoted_parse( text, pos ):
- """
- Parse one quoted tag value beginning from position: pos
- Return the value and the position of the 'cursor' behind the
- value for the first non white space char.
- In case of error the position value of None will signal failure.
- """
- # print 'text: [%s]' % text[pos:pos+80]
- # print 'pos: [%s]' % pos
- if text[ pos ] == '"':
- match_d_quote = pattern_d_quote.search( text, pos+1)
- if not match_d_quote:
- print "ERROR: No matching double quote char found for double quote char at offset:", 0
- print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
- return None, None
- ## if verbosity >= 9:
- ## print "pos, span():", pos, match_d_quote.span()
- ## print 'Found Q tag value: [%s]' % text[ pos+1:match_d_quote.start() ]
- return text[ pos+1:match_d_quote.start() ], match_d_quote.end()
- if text[ pos ] == "'":
- match_s_quote = pattern_s_quote.search( text, pos+1)
- if not match_s_quote:
- print "ERROR: No matching single quote char found for single quote char at offset:", 0
- print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
- return None, None
- value = text[ pos+1:match_s_quote.start() ]
- ## if verbosity >= 9:
- ## print "pos, span():", pos, match_s_quote.span()
- ## print 'Found Q tag value: [%s]' % value
- return value, match_s_quote.end()
- ## Remove check for speed if you want
- ## This should always be true
- if text[ pos ] == ";":
- match_e_semicolon = pattern_e_semicolon.search( text, pos+1)
- if not match_e_semicolon:
- print "ERROR: No matching semicolon found for semicolon char at offset:", 0
- print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
- return None, None
- ## print "pos, span():", pos, match_e_semicolon.span()
- ## Include the first eol and the eol before the semicolon
- value = text[ pos+1:match_e_semicolon.start()+eol_string_length ]
- ## Expansion relatively cheap here and harmless if unique string as defined in
- ## eol_string is indeed unique
- ## print 'Found Q (semicolon) tag value: unexpanded [%s]' % value
- ## print '-----------'
- ## print text[ match_e_semicolon.start()+eol_string_length : match_e_semicolon.start()+eol_string_length + 20]
- ## print '-----------'
- value = semicolon_block_expand( value )
- ## print 'Found Q (semicolon) tag value: expanded [%s]' % value
- return value, match_e_semicolon.end()
- print "ERROR: Position in text:", pos
- print """ERROR: should contain a ', ", or a ; but was not found:"""
- print "ERROR: Next 70 chars are: [%s]" % text[ pos:pos+70 ]
- return None, None
- def tag_value_parse( text, pos):
- """
- From text on position pos, read a tag value and return the value and
- position of the next non-space char. This is the slow parsing method
- that should only be used for free tags.
- """
- match_quoted = pattern_quoted.search( text, pos )
- if match_quoted:
- if match_quoted.start() == pos:
- return tag_value_quoted_parse( text, pos ) # Better speed with this code
- match_word = pattern_word.search( text, pos )
- if not match_word:
- print "ERROR: No match for a 'word' at offset:", pos
- print "ERROR: Next 70 chars are:", text[ pos:pos+70 ]
- return None, None
- if match_word.start() != pos:
- print "ERROR: Match for a 'word' at wrong offset:", match_word.start() - pos
- print "ERROR: Next 70 chars are:", text[ pos:pos+70 ]
- return None, None
- ## Include the first eol and the eol before the semicolon
- return match_word.group(1), match_word.end()
- def semicolon_block_replace( matchobj ):
- """
- See function semicolon_block_collapse that calls this one
- """
- #print len(matchobj.group())
- return re.sub( '\n', eol_string, matchobj.group() )
- def semicolon_block_collapse( text ):
- """
- This function should be called (not semicolon_block_replace)
- Putting all semicolon separated values on one line
- by replacing the eol within with a unique key value
- that is to be remove later on by it's sibling method:
- semicolon_block_expand.
- SPEED: 0.6 cpu seconds for a 5 Mb file with 31 blocks and
- 1.3 " 10 " 64 ".
- """
- # count = 0
- startpos = 0
- pattern_semicolon_only = re.compile("^\;", re.MULTILINE)
- # Added special _end pattern with $ for better pattern matching - Wim 31/10/2005
- pattern_semicolon_only_end = re.compile("(^\;\s*$)", re.MULTILINE)
- semicolon_start = pattern_semicolon_only.search(text[startpos:])
- while(semicolon_start):
- # count += 1
- startpos = startpos + semicolon_start.start()
- semicolon_end = pattern_semicolon_only_end.search(text[startpos+1:])
- try:
- endpos = startpos + 1 + semicolon_end.end() - len(semicolon_end.group(1)) + 1
- except:
- print "ERROR in semicolon_block_collapse for text starting at: ["+ text[startpos:startpos+100]+ "]"
- raise
- text_replace = re.sub("\n", eol_string,text[startpos:endpos])
- # This is bulky and not very elegant but works
- text= text[0:startpos] + text_replace + text[endpos:]
- startpos = startpos + len(text_replace)
- semicolon_start = pattern_semicolon_only.search(text[startpos:])
- # Original code: can't handle re matches that are too long
- #text, count = pattern_semicolon_block.subn( semicolon_block_replace, text )
- # nTdebug('Done [%s] subs with semicolon blocks' % count)
- return text
- def semicolon_block_expand( text ):
- return pattern_eol_string.sub('\n', text )
- def quotes_add( text ):
- """
- Adds semicolons, single quotes or double quotes depending on
- need according to star syntax.
- Does not assume that no quotes exist initially and will strip them if
- present in pairs only.
-
- If the possible_bad_char parameter is set (to 1 or higher) then
- strings that would normally end up in a semicolon delimited blob will
- have a string inserted at the beginning to it. The string can be the 'p'
- argument to this function. [TODO]
- """
- preferred_quote='"' # This info should be in a more central spot
- if pattern_eoline_etcet.search( text ):
- return semicolons_add( text )
- if pattern_single_qoute.search( text ):
- single_qoute_match = 1
- else:
- single_qoute_match = 0
- if pattern_double_qoute.search( text ):
- double_qoute_match = 1
- else:
- double_qoute_match = 0
- if single_qoute_match and double_qoute_match:
- return semicolons_add( text )
- if single_qoute_match:
- return '"' + text + '"'
- # Commented out because it leads to the same behaviour
- if double_qoute_match:
- return "'" + text + "'"
- ## Space other than end of line, or # sign etc.
- return preferred_quote + text + preferred_quote
- def quotes_strip( text ):
- "Strips quotes in pairs and returns new/old string"
- ## Can it be containing quotes?
- if len(text) <= 1:
- return text
- for quote_symbol in [ "\'", '\"' ]:
- if ( text[0] == quote_symbol and
- text[-1] == quote_symbol ):
- return text[1:-1]
- return text
- def semicolons_add( text, possible_bad_char=None ):
- """
- Returns the input with ; delimited, possibly with a string inserted at the beginning.
- The string value should always be ended by a eol, otherwise
- the second semicolon can not be the first char on a line.
- """
- if possible_bad_char:
- lines = text.split('\n')
- text = ''
- for line in lines:
- text = text + prepending_string + line + '\n'
- ## ## Code repeated for speed
- ## return "\n;" + text + ";\n"
- ## else:
- ## return "\n;" + text + ";\n"
- #JFD updates 5/23/2006; apparently the text does not always end with an eol.
- if not text.endswith('\n'):
- text = text + '\n'
- return "\n;\n" + text + ";\n"
- def comments_strip( text ):
- """
- Strip the STAR comments new style
- """
- lines = text.split( "\n" )
- i=0
- # count = 0
- ls = len(lines)
- # print "DEBUG: processing lines: ", ls
- while i<ls:
- # print "DEBUG: processing A line: ", i
- line = lines[i]
- # Scan past semi colon blocks.
- n = len(line)
- if n < 1:
- # print "DEBUG: skipping empty line: "
- i += 1
- continue
- if line[0] == ';': # start a semicolon block
- # print "DEBUG: found start of semi colon block."
- i += 1
- line = lines[i]
- # print "DEBUG: processing B line: ", i
- while len(line)==0 or line[0] != ';':
- i += 1
- line = lines[i]
- # print "DEBUG: processing C line: ", i
- # end a semicolon block
- else:
- line = _comments_strip_line(line)
- if len(line) != n:
- lines[i] = line
- # print "Changed from lenght",n,"to line: ["+line+"] at:", i
- # count += 1
- i += 1
- # if verbosity >= 9:
- # nTdebug( 'Done [%s] comment subs' % count )
- if lines:
- text = "\n".join(lines)
- # text = lines.join("\n")
- return text
- def _comments_strip_line( line ):
- """
- Strip the STAR comments for a single line.
- """
- c=0
- state = FREE # like to start out free which is possible after donning semicolon blocks.
- n = len(line)
- while c < n: # parse range [0,n> where n is length and exclusive.
- ch=line[c]
- # print "DEBUG: Processing char '"+ch+"' at "+repr(c)+" in state:", state
- if ( ch == sharp and state == FREE and # A sharp in FREE state
- (c==0 or line[c-1].isspace())): # behind a space or at beginning of a line.
- # print "DEBUG: Found sharpie"
- if c==0:
- return ''
- return line[0:c] # this is fast.
- if c==n-1: # c is the last character; leave it alone if it's not a sharpie
- return line
- if ch == doubleq:
- if (state == FREE and # new " behind space or at beginning of line
- (c==0 or line[c-1].isspace())):
- state = DOUBLE
- elif state == DOUBLE:
- if line[c+1].isspace(): # garanteed to exist now.
- state = FREE
- elif ch == singleq:
- if (state == FREE and
- (c==0 or line[c-1].isspace())):
- state = SINGLE
- elif state == SINGLE:
- if line[c+1].isspace():
- state = FREE
- c += 1
- return line
- def isStarNan(starValue):
- if NULL_STRING_DOT == starValue:
- return True
- # if '?' == starValue:
- # return True
- if None == starValue:
- return True
- return False
- def translateStarNanToNone(starValue):
- if isStarNan(starValue):
- return None
- return starValue
- #def comments_stripOld( text ):
- # # split for profiling
- # text = _comments_strip1(text)
- # text = _comments_strip2(text)
- # return text
- #
- #def _comments_strip1( text ):
- # text, count = pattern_comment_begin.subn( '', text )
- # if verbosity >= 9:
- # print 'Done [%s] subs with comment at beginning of line' % count
- # return text
- #
- #def _comments_strip2( text ):
- # text, count = pattern_comment_middle.subn( '\g<1>', text )
- # if verbosity >= 9:
- # print 'Done [%s] subs with comment not at beginning of line' % count
- # return text
- def nmrView_compress( text ):
- text, count = pattern_nmrView_compress_empty.subn( '{}', text )
- print 'Compressed [%s] nmrView empty { } tags' % count
- text, count = pattern_nmrView_compress_questionmark.subn( '{?}', text )
- print 'Compressed [%s] nmrView question mark { ?} tags' % count
- return text