/examples/videx/vafromb.py
http://echo-nest-remix.googlecode.com/ · Python
#!/usr/bin/env python
# encoding: utf-8
"""
vafromb.py

Re-synthesize video A using the segments of video B.

By Ben Lacker, 2009-02-24.
"""
import numpy
import sys
import time

from echonest import action, audio, video

usage="""
Usage:
    python vafromb.py <inputfilenameA> <inputfilenameB> <outputfilename> <Mix> [env]

Example:
    python vafromb.py BillieJean.mp4 CryMeARiver.mp4 BillieJeanFromCryMeARiver.mp4 0.9 env

The 'env' flag applies the volume envelopes of the segments of A to those
from B.

Mix is a number 0-1 that determines the relative mix of the resynthesized
song and the original input A. i.e. a mix value of 0.9 yields an output that
is mostly the resynthesized version.
"""
class AfromB(object):
    def __init__(self, input_filename_a, input_filename_b, output_filename):
        "Synchronizes slavebundle on masterbundle, writes to outbundle"
        self.master = video.loadav(input_filename_a)
        # convert slave so it matches master's settings
        converted = video.convertmov(input_filename_b, settings=self.master.video.settings)
        self.slave = video.loadav(converted)
        self.out = output_filename

        self.input_a = self.master.audio
        self.input_b = self.slave.audio
        self.segs_a = self.input_a.analysis.segments
        self.segs_b = self.input_b.analysis.segments
        self.output_filename = output_filename

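    # For one segment of A, compute the squared difference in pitch vector, timbre
    # vector, and starting loudness against every segment of B. The fourth column
    # stores each B segment's index; it is not part of the distance itself.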
    def calculate_distances(self, a):
        distance_matrix = numpy.zeros((len(self.segs_b), 4), dtype=numpy.float32)
        pitch_distances = []
        timbre_distances = []
        loudmax_distances = []
        for b in self.segs_b:
            pitch_diff = numpy.subtract(b.pitches, a.pitches)
            pitch_distances.append(numpy.sum(numpy.square(pitch_diff)))
            timbre_diff = numpy.subtract(b.timbre, a.timbre)
            timbre_distances.append(numpy.sum(numpy.square(timbre_diff)))
            loudmax_diff = b.loudness_begin - a.loudness_begin
            loudmax_distances.append(numpy.square(loudmax_diff))
        distance_matrix[:, 0] = pitch_distances
        distance_matrix[:, 1] = timbre_distances
        distance_matrix[:, 2] = loudmax_distances
        distance_matrix[:, 3] = range(len(self.segs_b))
        distance_matrix = self.normalize_distance_matrix(distance_matrix)
        return distance_matrix

    def normalize_distance_matrix(self, mat, mode='minmed'):
        """Normalize a distance matrix on a per-column basis.

        'minstd' subtracts each column's minimum and divides by its standard
        deviation, 'minmed' subtracts each column's minimum and divides by the
        overall median, and 'std' divides by the per-column standard deviation;
        every mode also divides by the number of columns.
        """
        if mode == 'minstd':
            mini = numpy.min(mat, 0)
            m = numpy.subtract(mat, mini)
            std = numpy.std(mat, 0)
            m = numpy.divide(m, std)
            m = numpy.divide(m, mat.shape[1])
        elif mode == 'minmed':
            mini = numpy.min(mat, 0)
            m = numpy.subtract(mat, mini)
            med = numpy.median(m)
            m = numpy.divide(m, med)
            m = numpy.divide(m, mat.shape[1])
        elif mode == 'std':
            std = numpy.std(mat, 0)
            m = numpy.divide(mat, std)
            m = numpy.divide(m, mat.shape[1])
        return m

    def run(self, mix=0.5, envelope=False):
        dur = len(self.input_a.data) + 100000 # another two seconds
        # determine shape of new array.
        # do everything in mono; I'm not fancy.
        new_shape = (dur,)
        new_channels = 1
        self.input_a = action.make_mono(self.input_a)
        self.input_b = action.make_mono(self.input_b)
        out = audio.AudioData(shape=new_shape, sampleRate=self.input_b.sampleRate, numChannels=new_channels)
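        # Main resynthesis loop: for each segment of A, find the closest-sounding
        # segment of B, fit it to A's segment length, optionally re-apply A's
        # loudness envelope, and mix the result with the original A audio.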
        for a in self.segs_a:
            seg_index = a.absolute_context()[0]
            # find best match from segs in B
            distance_matrix = self.calculate_distances(a)
            distances = [numpy.sqrt(x[0] + x[1] + x[2]) for x in distance_matrix]
            match = self.segs_b[distances.index(min(distances))]
            segment_data = self.input_b[match]
            reference_data = self.input_a[a]
            if segment_data.endindex < reference_data.endindex:
                if new_channels > 1:
                    silence_shape = (reference_data.endindex, new_channels)
                else:
                    silence_shape = (reference_data.endindex,)
                new_segment = audio.AudioData(shape=silence_shape,
                                              sampleRate=out.sampleRate,
                                              numChannels=segment_data.numChannels)
                new_segment.append(segment_data)
                new_segment.endindex = len(new_segment)
                segment_data = new_segment
            elif segment_data.endindex > reference_data.endindex:
                index = slice(0, int(reference_data.endindex), 1)
                segment_data = audio.AudioData(None, segment_data.data[index],
                                               sampleRate=segment_data.sampleRate)
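            # Video splice: overwrite A's frames for this segment with the frames of
            # the matched B segment, repeating B's last copied frame if B's clip is
            # shorter than A's segment.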
            chopvideo = self.slave.video[match] # get editableframes object
            masterchop = self.master.video[a]
            startframe = self.master.video.indexvoodo(a.start) # find start index
            endframe = self.master.video.indexvoodo(a.start + a.duration)
            for i in xrange(len(chopvideo.files)):
                if startframe + i < len(self.master.video.files):
                    self.master.video.files[startframe + i] = chopvideo.files[i]
            last_frame = chopvideo.files[i]
            for i in xrange(len(chopvideo.files), len(masterchop.files)):
                if startframe + i < len(self.master.video.files):
                    self.master.video.files[startframe + i] = last_frame

            if envelope:
                # db -> voltage ratio http://www.mogami.com/e/cad/db.html
                linear_max_volume = pow(10.0, a.loudness_max / 20.0)
                linear_start_volume = pow(10.0, a.loudness_begin / 20.0)
                if seg_index == len(self.segs_a) - 1: # if this is the last segment
                    linear_next_start_volume = 0
                else:
                    linear_next_start_volume = pow(10.0, self.segs_a[seg_index + 1].loudness_begin / 20.0)
                when_max_volume = a.time_loudness_max
                # Count how many samples the volume ramps cover so rounding errors
                # can be fixed up later.
                ss = 0
                # Set the volume of this segment: start at the start volume, ramp up
                # to the max volume, then ramp back down to the next start volume.
                cur_vol = float(linear_start_volume)
                # Ramp up from the start volume to the max volume.
                samps_to_max_loudness_from_here = int(segment_data.sampleRate * when_max_volume)
                if samps_to_max_loudness_from_here > 0:
                    how_much_volume_to_increase_per_samp = float(linear_max_volume - linear_start_volume) / float(samps_to_max_loudness_from_here)
                    for samps in xrange(samps_to_max_loudness_from_here):
                        try:
                            segment_data.data[ss] *= cur_vol
                        except IndexError:
                            pass
                        cur_vol = cur_vol + how_much_volume_to_increase_per_samp
                        ss = ss + 1
                # Now ramp down from the max volume to the start of the next segment.
                samps_to_next_segment_from_here = int(segment_data.sampleRate * (a.duration - when_max_volume))
                if samps_to_next_segment_from_here > 0:
                    how_much_volume_to_decrease_per_samp = float(linear_max_volume - linear_next_start_volume) / float(samps_to_next_segment_from_here)
                    for samps in xrange(samps_to_next_segment_from_here):
                        cur_vol = cur_vol - how_much_volume_to_decrease_per_samp
                        try:
                            segment_data.data[ss] *= cur_vol
                        except IndexError:
                            pass
                        ss = ss + 1
            mixed_data = audio.mix(segment_data, reference_data, mix=mix)
            out.append(mixed_data)
        self.master.audio = out
        self.master.save(self.output_filename)

def main():
    try:
        input_filename_a = sys.argv[1]
        input_filename_b = sys.argv[2]
        output_filename = sys.argv[3]
        mix = float(sys.argv[4]) # mix arrives as a string; convert to a number in 0-1
        if len(sys.argv) == 6:
            env = True
        else:
            env = False
    except Exception:
        print usage
        sys.exit(-1)
    AfromB(input_filename_a, input_filename_b, output_filename).run(mix=mix, envelope=env)

if __name__ == '__main__':
    tic = time.time()
    main()
    toc = time.time()
    print "Elapsed time: %.3f sec" % float(toc - tic)