PageRenderTime 40ms CodeModel.GetById 2ms app.highlight 32ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/tool_shed/migrate_tools_to_repositories.py

https://bitbucket.org/cistrome/cistrome-harvard/
Python | 337 lines | 302 code | 2 blank | 33 comment | 22 complexity | f58a55c7426a3ec2dad1f07373290085 MD5 | raw file
  1#!/usr/bin/env python
  2
  3'''
  4Migrate old Galaxy tool shed to next gen Galaxy tool shed.  Specifically, the tool archives stored as
  5files in the old tool shed will be migrated to mercurial repositories in the next gen tool shed.  This
  6script can be run any number of times as it initially eliminates any current repositories and db records
  7associated with them, and migrates old tool shed stuff to new tool shed stuff.
  8
  9====== CRITICAL =======
 10
 110. This script must be run on a repo updated to changeset:   5621:4618be57481b
 12
 131. Before running this script, make sure the following config setting is set in tool_shed_wsgi.ini
 14
 15# Enable next-gen tool shed features
 16enable_next_gen_tool_shed = True
 17
 182. This script requires the Galaxy instance to use Postgres for database storage.  
 19
 20To run this script, use "sh migrate_tools_to_repositories.sh" from this directory
 21'''
 22
 23import sys, os, subprocess, ConfigParser, shutil, tarfile, tempfile
 24
 25assert sys.version_info[:2] >= ( 2, 4 )
 26new_path = [ os.path.join( os.getcwd(), "lib" ) ]
 27new_path.extend( sys.path[1:] ) # remove scripts/ from the path
 28sys.path = new_path
 29
 30from galaxy import eggs
 31import pkg_resources
 32pkg_resources.require( "psycopg2" )
 33import psycopg2
 34
 35import galaxy.webapps.tool_shed.app
 36from mercurial import hg, ui, httprepo, commands
 37from time import strftime
 38
 39def directory_hash_id( id ):
 40    s = str( id )
 41    l = len( s )
 42    # Shortcut -- ids 0-999 go under ../000/
 43    if l < 4:
 44        return [ "000" ]
 45    # Pad with zeros until a multiple of three
 46    padded = ( ( ( 3 - len( s ) ) % 3 ) * "0" ) + s
 47    # Drop the last three digits -- 1000 files per directory
 48    padded = padded[:-3]
 49    # Break into chunks of three
 50    return [ padded[i*3:(i+1)*3] for i in range( len( padded ) // 3 ) ]
 51
 52def get_versions( app, item ):
 53    """Get all versions of item whose state is a valid state"""
 54    valid_states = [ app.model.Tool.states.NEW, 
 55                     app.model.Tool.states.WAITING, 
 56                     app.model.Tool.states.APPROVED, 
 57                     app.model.Tool.states.ARCHIVED ]
 58    versions = [ item ]
 59    this_item = item
 60    while item.newer_version:
 61        if item.newer_version.state in valid_states:
 62            versions.append( item.newer_version )
 63        item = item.newer_version
 64    item = this_item
 65    while item.older_version:
 66        if item.older_version[ 0 ].state in valid_states:
 67            versions.insert( 0, item.older_version[ 0 ] )
 68        item = item.older_version[ 0 ]
 69    return versions
 70
 71def get_approved_tools( app, sa_session ):
 72    """Get only the latest version of each tool from the database whose state is approved"""
 73    tools = []
 74    for tool in sa_session.query( app.model.Tool ) \
 75                          .order_by( app.model.Tool.table.c.name ):
 76        if tool.state == app.model.Tool.states.APPROVED:
 77            tools.append( tool )
 78    return tools
 79
 80def create_repository_from_tool( app, sa_session, tool ):
 81    # Make the repository name a form of the tool's tool_id by
 82    # lower-casing everything and replacing any blank spaces with underscores.
 83    repo_name = tool.tool_id.lower().replace( ' ', '_' )
 84    print "Creating repository '%s' in database" % ( repo_name )
 85    repository = app.model.Repository( name=repo_name,
 86                                       description=tool.description,
 87                                       user_id = tool.user_id )
 88    # Flush to get the id
 89    sa_session.add( repository )
 90    sa_session.flush()
 91    # Determine the local repository's path on disk
 92    dir = os.path.join( app.config.file_path, *directory_hash_id( repository.id ) )
 93    # Create directory if it does not exist
 94    if not os.path.exists( dir ):
 95        os.makedirs( dir )
 96    # Define repository name inside hashed directory
 97    repository_path = os.path.join( dir, "repo_%d" % repository.id )
 98    # Create repository directory
 99    if not os.path.exists( repository_path ):
100        os.makedirs( repository_path )
101    # Create the local hg repository
102    print "Creating repository '%s' on disk" % ( os.path.abspath( repository_path ) )
103    repo = hg.repository( ui.ui(), os.path.abspath( repository_path ), create=True )
104    # Add an entry in the hgweb.config file for the new repository - this enables calls to repository.repo_path
105    add_hgweb_config_entry( repository, repository_path )
106    # Migrate tool categories
107    for tca in tool.categories:
108        category = tca.category
109        print "Associating category '%s' with repository '%s' in database" % ( category.name, repository.name )
110        rca = app.model.RepositoryCategoryAssociation( repository, category )
111        sa_session.add( rca )
112    sa_session.flush()
113    # Migrate tool ratings
114    print "Associating ratings for tool '%s' with repository '%s'" % ( tool.name, repository.name )
115    for tra in tool.ratings:
116        rra = app.model.RepositoryRatingAssociation( user=tra.user,
117                                                     rating=tra.rating,
118                                                     comment=tra.comment )
119        rra.repository=repository
120        sa_session.add( rra )
121    sa_session.flush()
122
def add_hgweb_config_entry( repository, repository_path ):
    """
    Append an entry for a repository to the hgweb.config file in the current working
    directory, creating the file with its [paths] header when it does not exist yet.
    Having the entry in place enables calls to repository.repo_path.

    An entry looks something like:
        repos/test/mira_assembler = database/community_files/000/repo_123

    repository      -- object providing user.username and name for the left-hand side
    repository_path -- location of the repository on disk; a leading "./" is dropped,
                       any other path (absolute, "../...") is written unchanged
    """
    hgweb_config = "%s/hgweb.config" % os.getcwd()
    # Only drop literal "./" prefixes.  str.lstrip( './' ) would strip any run of '.'
    # and '/' characters, turning "/abs/path" into "abs/path" and "../x" into "x".
    entry_path = repository_path
    while entry_path.startswith( './' ):
        entry_path = entry_path[ 2: ]
    entry = "repos/%s/%s = %s" % ( repository.user.username, repository.name, entry_path )
    # Decide before opening whether the [paths] header is needed, since opening in
    # append mode creates the file
    needs_header = not os.path.exists( hgweb_config )
    output = open( hgweb_config, 'a' )
    try:
        if needs_header:
            output.write( '[paths]\n' )
        output.write( "%s\n" % entry )
    finally:
        # Release the handle even when a write fails
        output.close()
135
def create_hgrc_file( repository ):
    """
    Write the .hg/hgrc file of a repository.  The repository must already have an
    entry in hgweb.config so that repository.repo_path can be used.  The generated
    file looks something like this:

        [web]
        allow_push = test
        name = convert_characters1
        push_ssl = false

    Only the owner may push to a freshly created repository ( allow_push ), and since
    both http and https are supported, push_ssl is set to false to override the
    mercurial default (which is True).
    """
    hgrc_file = os.path.abspath( os.path.join( repository.repo_path, ".hg", "hgrc" ) )
    output = open( hgrc_file, 'w' )
    web_section = ( '[web]\n',
                    'allow_push = %s\n' % repository.user.username,
                    'name = %s\n' % repository.name,
                    'push_ssl = false\n' )
    for line in web_section:
        output.write( line )
    output.flush()
    output.close()
155
def _run_hg( arguments, cwd ):
    # Run one hg command from within cwd.  The arguments are passed as a list so no
    # shell ever parses them (file names come from user-supplied archives).
    status = subprocess.call( [ 'hg' ] + list( arguments ), cwd=cwd )
    if status != 0:
        print( "Warning: 'hg %s' exited with status %d" % ( ' '.join( arguments ), status ) )
    return status

def add_tool_files_to_repository( app, sa_session, tool ):
    """
    Migrate every valid version of a tool from its archive into the tool's repository.

    For each version, oldest first: clone the repository into a temporary directory,
    extract the version's archive over the clone, add and commit the files as the
    repository owner, push the change set back, update the repository's working copy
    and (re)write its hgrc file.  The temporary directory is removed even when a step
    fails, and the process working directory is never changed.

    Files named hgrc are left out of the commit at every directory level.
    """
    # Get the repository to which the tool will be migrated
    repo_name = tool.tool_id.lower().replace( ' ', '_' )
    repository = get_repository_by_name( app, sa_session, repo_name )
    repo_path = os.path.abspath( repository.repo_path )
    for tool_version in get_versions( app, tool ):
        print( "------------------------------" )
        print( "Migrating tool '%s' version '%s' from archive to repository '%s'" % ( tool_version.tool_id, tool_version.version, repo_path ) )
        tmp_dir = tempfile.mkdtemp()
        try:
            tmp_archive_dir = os.path.join( tmp_dir, 'tmp_archive_dir' )
            os.makedirs( tmp_archive_dir )
            # hg names the clone after the last component of the source path
            _run_hg( [ 'clone', repo_path ], tmp_archive_dir )
            cloned_repo_dir = os.path.join( tmp_archive_dir, 'repo_%d' % repository.id )
            # We want these change sets to be associated with the owner of the repository.
            # Mercurial takes the commit username from $HGUSER before looking at the [ui]
            # section of hgrcs or $EMAIL, so set it for the hg commands run below.
            os.environ[ 'HGUSER' ] = repository.user.username
            # Copy the tool archive to the tmp_archive_dir.  The src file cannot be derived
            # from tool.file_name here because we have not loaded the Tool class in the
            # model, so the tool.file_name defaults to /tmp/...
            archive_dir = os.path.join( app.config.file_path, 'tools', *directory_hash_id( tool_version.id ) )
            src = os.path.abspath( os.path.join( archive_dir, 'tool_%d.dat' % tool_version.id ) )
            dst = os.path.join( tmp_archive_dir, tool_archive_file_name( tool_version, src ) )
            shutil.copy( src, dst )
            # NOTE(review): extractall trusts the member names inside the archive; an archive
            # containing absolute or "../" member names can write outside cloned_repo_dir.
            archive = tarfile.open( dst )
            try:
                archive.extractall( path=cloned_repo_dir )
            finally:
                archive.close()
            os.remove( dst )
            for root, dirs, files in os.walk( cloned_repo_dir ):
                if '.hg' in dirs:
                    # Don't visit .hg directories
                    dirs.remove( '.hg' )
                for name in files:
                    if name == 'hgrc':
                        # Don't include hgrc files in commit
                        continue
                    # hg runs from the clone's root, so hand it the path relative to that
                    # root; a bare file name would be wrong for anything in a sub-directory
                    relative_name = os.path.relpath( os.path.join( root, name ), cloned_repo_dir )
                    print( "Adding file '%s' to cloned repository at %s" % ( relative_name, cloned_repo_dir ) )
                    _run_hg( [ 'add', relative_name ], cloned_repo_dir )
            print( "Committing change set to cloned repository at %s" % cloned_repo_dir )
            commit_message = "Migrated tool version %s from old tool shed archive to new tool shed repository" % tool_version.version
            _run_hg( [ 'commit', '-m', commit_message ], cloned_repo_dir )
            print( "Pushing changeset from cloned repository '%s' to repository '%s'" % ( cloned_repo_dir, repo_path ) )
            _run_hg( [ 'push', repo_path ], cloned_repo_dir )
            # The tool shed includes a repository source file browser, which currently depends
            # upon copies of the hg repository file store in the repo_path for browsing.
            # Updating the working copy there makes these copies.
            _run_hg( [ 'update' ], repo_path )
            # Now that the repository is current with all change sets, create its hgrc file
            create_hgrc_file( repository )
        finally:
            # Remove tmp directory, whether or not the migration of this version succeeded
            shutil.rmtree( tmp_dir )
226
def get_repository_by_name( app, sa_session, repo_name ):
    """Look up a repository record by name and return the result of the query's one() call"""
    all_repositories = sa_session.query( app.model.Repository )
    named_repositories = all_repositories.filter_by( name=repo_name )
    return named_repositories.one()
230
def contains( containing_str, contained_str ):
    """Return True when contained_str occurs in containing_str, ignoring case"""
    haystack = containing_str.lower()
    needle = contained_str.lower()
    return needle in haystack
233
def tool_archive_extension( file_name ):
    """
    Guess the extension of a tool archive from the first bytes of the file.

    Returns 'tar.bz2' when the file starts with "BZh" followed by a digit, 'tar.gz'
    when it starts with the two bytes 0x1f 0x8b, and 'tar' for anything else,
    including files too short to hold a signature.
    """
    archive = open( file_name, 'rb' )
    try:
        head = archive.read( 4 )
    finally:
        archive.close()
    # Plain comparisons rather than assert statements: asserts are stripped when Python
    # runs with -O, which would report every file as bzip2.  Bytes literals keep the
    # comparisons valid where reading in 'rb' mode returns bytes rather than str.
    if head[ :3 ] == b'BZh' and head[ 3:4 ].isdigit():
        return 'tar.bz2'
    if head[ :2 ] == b'\037\213':
        return 'tar.gz'
    return 'tar'
253
def tool_archive_file_name( tool, file_name ):
    """
    Build the archive file name <tool_id>_<version>.<extension> for a tool, where the
    extension is determined from the content of file_name.
    """
    name_parts = ( tool.tool_id, tool.version, tool_archive_extension( file_name ) )
    return '%s_%s.%s' % name_parts
256    
def main():
    """
    Rebuild the next gen tool shed repositories from the old tool shed's tool archives.

    Expects the tool shed config file (tool_shed_wsgi.ini) as the first command line
    argument.  The existing hgweb.config file, repository directories and repository
    db records are removed first; then a repository is created and populated for every
    approved tool.  Exits with status 0 on completion and with a non-zero status when
    the config file argument is missing.
    """
    if len( sys.argv ) < 2:
        # A missing argument is an error: sys.exit with a string writes it to stderr
        # and exits with a non-zero status
        sys.exit( "Usage: python %s <Tool shed config file>" % sys.argv[0] )
    now = strftime( "%Y-%m-%d %H:%M:%S" )
    print( " " )
    print( "##########################################" )
    print( "%s - Migrating current tool archives to new tool repositories" % now )
    # tool_shed_wsgi.ini file
    ini_file = sys.argv[1]
    conf_parser = ConfigParser.ConfigParser( {'here':os.getcwd()} )
    conf_parser.read( ini_file )
    try:
        db_conn_str = conf_parser.get( "app:main", "database_connection" )
    except ConfigParser.NoOptionError:
        db_conn_str = conf_parser.get( "app:main", "database_file" )
    # The connection string is only reported here; the whole [app:main] section,
    # including the connection setting, is handed to the application below.
    print( 'DB Connection:  %s' % db_conn_str )
    # Instantiate app
    configuration = dict( conf_parser.items( "app:main" ) )
    app = galaxy.webapps.tool_shed.app.UniverseApplication( global_conf=dict( __file__=ini_file ), **configuration )
    sa_session = app.model.context
    # Remove the hgweb.config file if it exists
    hgweb_config = "%s/hgweb.config" % os.getcwd()
    if os.path.exists( hgweb_config ):
        print( "Removing old file:  %s" % hgweb_config )
        os.remove( hgweb_config )
    repo_records = 0
    rca_records = 0
    rra_records = 0
    for repo in sa_session.query( app.model.Repository ):
        # Remove the hg repository from disk.  We have to be careful here, because old
        # tool files exist in app.config.file_path/tools and we don't want to delete them
        repo_dir = os.path.join( app.config.file_path, *directory_hash_id( repo.id ) )
        if os.path.exists( repo_dir ):
            print( "Removing old repository file directory:  %s" % repo_dir )
            shutil.rmtree( repo_dir )
        # Delete all records from db tables:
        # repository_category_association, repository_rating_association, repository
        print( "Deleting db records for repository:  %s" % repo.name )
        for rca in repo.categories:
            sa_session.delete( rca )
            rca_records += 1
        for rra in repo.ratings:
            sa_session.delete( rra )
            rra_records += 1
        sa_session.delete( repo )
        repo_records += 1
    sa_session.flush()
    print( "Deleted %d rows from the repository table" % repo_records )
    print( "Deleted %d rows from the repository_category_association table" % rca_records )
    print( "Deleted %d rows from the repository_rating_association table" % rra_records )
    # Migrate database tool, tool category and tool rating records to new
    # database repository, repository category and repository rating records
    # and create the hg repository on disk for each.
    for tool in get_approved_tools( app, sa_session ):
        create_repository_from_tool( app, sa_session, tool )
    # Add, commit and push all valid versions of each approved tool to the
    # associated hg repository.
    for tool in get_approved_tools( app, sa_session ):
        add_tool_files_to_repository( app, sa_session, tool )
    app.shutdown()
    print( ' ' )
    print( 'Migration to next gen tool shed complete...' )
    print( "##########################################" )
    sys.exit( 0 )

if __name__ == "__main__":
    main()