TEES /TEES/Tools/ProcessUtils.py

Language: Python    Lines: 270
MD5 Hash: c2ec9d5680d02dec2ddc26cdfe2a172a
Repository: https://bitbucket.org/yumyai/tees.git
import sys, os, codecs, time, signal
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/..")
from TEES.Utils.ProgressCounter import ProgressCounter

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import cElementTree as ET

class ProcessWrapper:
    """
    Killing a process spawned by a shell is not really possible (at least in Python).
    This becomes a problem if a tool requires multiple (e.g. piped) processes to be
    run. With ProcessWrapper, all processes can be called directly from Python so
    that their ids are known and they can be killed if they hang. A ProcessWrapper can
    be passed as a parameter to ProcessUtils functions in place of a subprocess.Popen
    object. 
    """
    def __init__(self, processes):
        self.processes = processes # subprocesses
    
    def kill(self):
        """
        Kill all subprocesses
        """
        for process in self.processes:
            try:
                process.kill()
            except:
                pass
        for process in self.processes:
            poll = process.poll()
            #print poll
            while poll is None:
                poll = process.poll()
                time.sleep(1)
            #print poll
    
    def poll(self):
        """
        If any subprocess is running, returns None (not finished).
        """
        for process in self.processes:
            if process.poll() is None:
                return None
        return "FINISHED"

def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
    """
    Waits for a process to finish, and tracks the number of sentences it writes
    to its output file. If writing a sentence takes longer than the timeout,
    the process is considered stalled and is killed.
    """
    maxStartupTime = 600 # Give extra time for the process to start up (even if it immediately creates an empty output file)
    counter = ProgressCounter(numCorpusSentences, counterName)
    counter.showMilliseconds = True
    numSentences = 0 # Number of sentences counted in the output file (stays 0 if the output file never appears)
    prevNumSentences = 0 # Number of output sentences on previous check
    finalCheckLeft = True # Make one final check to update counters
    processStatus = None # When None, process not finished
    prevTime = time.time()
    startTime = time.time()
    # Wait until the process is finished and periodically check its progress.
    while processStatus is None or finalCheckLeft:
        if processStatus is not None: # Extra loop to let counters finish
            finalCheckLeft = False # Done only once
        if os.path.exists(outputFile[0]): # Output file has already appeared on disk
            # Measure number of sentences in output file
            numSentences = 0
            f = codecs.open(outputFile[0], "rt", **outputFile[1])
            for line in f:
                if measureByGap:
                    if line.strip() == "":
                        numSentences += 1
                else:
                    numSentences += 1
            f.close()
            # Update status
            if numSentences - prevNumSentences != 0: # Process has progressed
                counter.update(numSentences - prevNumSentences, updateMessage + ": ")
            if finalCheckLeft: # This is a normal loop, not the final check
                # Startup time hasn't passed yet or the process has made progress
                if time.time() - startTime < maxStartupTime or numSentences - prevNumSentences != 0:
                #if prevNumSentences == 0 or numSentences - prevNumSentences != 0:
                    prevTime = time.time() # reset timeout
                else: # Nothing happened on this update, check whether process hung
                    elapsedTime = time.time() - prevTime
                    if timeout is not None and elapsedTime > timeout:
                        print >> sys.stderr, "Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")"
                        print >> sys.stderr, "Killing process"
                        process.kill()
                prevNumSentences = numSentences
                time.sleep(1)
        else: # Output file doesn't exist yet
            prevTime = time.time() # reset counter if output file hasn't been created
        processStatus = process.poll() # Get process status, None == still running
    
    counter.markFinished() # If we get this far, don't show the error message even if process didn't finish
    return (numSentences, numCorpusSentences)
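
# A hedged usage sketch (the tool name and file paths are assumptions, not from this
# module): waiting for an external sentence-level tool while counting sentences in its
# output file, and killing it if no progress is made within the timeout.
#
#   import subprocess
#   proc = subprocess.Popen(["some-parser", "sentences.txt", "parses.txt"])
#   # outputFile is passed as a (path, codecs-keyword-arguments) tuple
#   counted, expected = waitForProcess(proc, numCorpusSentences=1000, measureByGap=False,
#                                      outputFile=("parses.txt", {"encoding": "utf-8"}),
#                                      counterName="parse", updateMessage="Parsing",
#                                      timeout=600)
#   if counted != expected:
#       print >> sys.stderr, "Tool did not produce output for all sentences"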

def makeSubset(input, workdir, fromLine):
    """
    Make a subset of the input data from "fromLine" to end of input file.
    """
    newInput = os.path.join(workdir, "input-from-" + str(fromLine))
    newInputFile = codecs.open(newInput, "wt", "utf-8")

    inputFile = codecs.open(input, "rt", "utf-8")
    lineCount = -1
    for line in inputFile:
        lineCount += 1
        if lineCount < fromLine:
            continue
        newInputFile.write(line)  
    inputFile.close()
    newInputFile.close()
    return newInput
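
# Example sketch (hypothetical paths): if an external tool died at sentence 120, an input
# file containing only the remaining sentences can be produced with
#
#   remainingInput = makeSubset("/data/sentences.txt", "/tmp/workdir", 120)
#   # remainingInput == "/tmp/workdir/input-from-120"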

def mergeOutput(dir, numCorpusSentences, measureByGap, outputArgs={}):
    """
    Merge output files (multiple files may have been created if the program failed on a sentence)
    """
    filenames = os.listdir(dir)
    outputs = []
    for filename in filenames:
        if filename.find("output-from") != -1:
            outputs.append( (int(filename.rsplit("-", 1)[-1]), filename) )
    outputs.sort() # Order output sets by their first sentence index
    #print outputs
    
    mergedOutput = codecs.open(os.path.join(dir, "merged-output"), "wt", **outputArgs)
    
    missingSentences = 0
    numSentences = 0
    # Go through output subsets in order
    for i in range(len(outputs)):
        f = codecs.open(os.path.join(dir, outputs[i][1]), "rt", **outputArgs)
        for line in f: # Copy to merged file
            mergedOutput.write(line)
            if measureByGap:
                if line.strip() == "":
                    numSentences += 1
            else:
                numSentences += 1
        f.close()
        # If sentences are missing from output, write empty lines in merged output
        if i < len(outputs) - 1: # not last output
            while numSentences < outputs[i+1][0]: # Start of next subset not reached yet
                mergedOutput.write("\n")
                numSentences += 1
                missingSentences += 1
        else: # last of the output subsets
            while numSentences < numCorpusSentences: # End of whole data not reached yet
                mergedOutput.write("\n")
                numSentences += 1
                missingSentences += 1
    mergedOutput.close()
    return missingSentences
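
# Example sketch (hypothetical directory layout): after one or more restarts the work
# directory contains files such as "output-from-0", "output-from-121", ... These are
# concatenated into "merged-output", padding any skipped sentences with empty lines.
#
#   missing = mergeOutput("/tmp/workdir", numCorpusSentences=1000, measureByGap=False,
#                         outputArgs={"encoding": "utf-8"})
#   if missing > 0:
#       print >> sys.stderr, "Merged output is missing", missing, "sentences"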

def getSubsetEndPos(subsetFileName, measureByGap):
    """
    Return the sentence position this process reached, by counting
    the sentences in the output file.
    """
    if subsetFileName.find("-from-") == -1:
        return 0
    numSentences = getLines(subsetFileName, measureByGap)
    subsetPos = int(subsetFileName.rsplit("-", 1)[-1])
    return subsetPos + numSentences

def getLines(filename, measureByGap):
    """
    Number of sentences in the file, measured either in lines or by empty "gap" lines.
    """
    numSentences = 0
    f = codecs.open(filename, "rt", "utf-8")
    for line in f:
        if measureByGap:
            if line.strip() == "":
                numSentences += 1
        else:
            numSentences += 1
    f.close()
    return numSentences

def runSentenceProcess(launchProcess, programDir, input, workdir, measureByGap, counterName, updateMessage, timeout=None, processArgs={}, outputArgs={}):
    """
    Runs a process on input sentences, and in case of problems skips one sentence and 
    reruns the process on the remaining ones.
    """
    # Count input sentences
    input = os.path.abspath(input)
    origInput = input
    numCorpusSentences = 0
    inputFile = codecs.open(input, "rt", "utf-8")
    for line in inputFile:
        numCorpusSentences += 1
    inputFile.close()
    
    if "encoding" not in outputArgs:
        outputArgs["encoding"] = "utf-8"
    
    cwd = os.getcwd()
    os.chdir(programDir)
    finished = False
    startLine = 0
    while not finished:
        # Count lines in the input file (input data must be in a one-sentence-per-line format)
        inputLines = 0
        inputFile = codecs.open(input, "rt", "utf-8")
        for line in inputFile:
            inputLines += 1
        inputFile.close()

        output = os.path.join(workdir, "output-from-" + str(startLine))
        process = launchProcess(input, output, **processArgs)
        result = waitForProcess(process, inputLines, measureByGap, (output, outputArgs), counterName, updateMessage, timeout)
        if result[0] != result[1]:
            gap = 1
            startLine = getSubsetEndPos(output, measureByGap) + gap 
            if startLine >= numCorpusSentences:
                finished = True
            else:
                print >> sys.stderr, "Process failed for sentence " + str(startLine-gap) + ", rerunning from sentence", startLine
                input = makeSubset(origInput, workdir, startLine)
        else:
            finished = True
    os.chdir(cwd)
    
    numMissedSentences = mergeOutput(workdir, numCorpusSentences, measureByGap, outputArgs=outputArgs)
    if numMissedSentences == 0:
        print >> sys.stderr, "Processed succesfully all sentences"
    else:
        print >> sys.stderr, "Warning, processing failed for", numMissedSentences, "out of", numCorpusSentences, "sentences"
    return os.path.abspath(os.path.join(workdir, "merged-output"))
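
# A hedged end-to-end sketch (the parser name, paths and launcher function are
# hypothetical): runSentenceProcess expects a callback that starts the external tool for
# a given input/output file pair and returns its subprocess.Popen (or ProcessWrapper).
#
#   import subprocess
#   def launchMyParser(inputPath, outputPath):
#       return subprocess.Popen(["my-parser", inputPath, outputPath])
#
#   mergedPath = runSentenceProcess(launchMyParser, "/opt/my-parser", "/data/sentences.txt",
#                                   "/tmp/workdir", measureByGap=False, counterName="parse",
#                                   updateMessage="Parsing", timeout=600)
#   # mergedPath points to "/tmp/workdir/merged-output", with one result per corpus sentence.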

def getElementIndex(parent, element):
    index = 0
    for e in parent:
        if e == element:
            return index
        index += 1
    return -1

def getPrevElementIndex(parent, eTag):
    index = 0
    elemIndex = -1
    for element in parent:
        if element.tag == eTag:
            elemIndex = index
        index += 1
    return elemIndex

def getElementByAttrib(parent, tag, attDict):
    for element in parent.getiterator():
        if element.tag == tag:
            found = True
            for k, v in attDict.iteritems():
                if element.get(k) != v:
                    found = False
            if found:
                return element
    return None

def setDefaultElement(parent, name):
    element = parent.find(name)
    if element is None:
        element = ET.Element(name)
        parent.append(element)
    return element
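
# A small usage sketch for the ElementTree helpers above (the XML content is made up):
#
#   root = ET.fromstring('<document><sentence id="s1"/><sentence id="s2"/></document>')
#   second = getElementByAttrib(root, "sentence", {"id": "s2"})  # element with id "s2"
#   getElementIndex(root, second)          # -> 1
#   getPrevElementIndex(root, "sentence")  # -> index of the last "sentence" child, here 1
#   analyses = setDefaultElement(root, "analyses")  # appends <analyses> if not present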