/scripts/tool_shed/migrate_tools_to_repositories.py
https://bitbucket.org/cistrome/cistrome-harvard/
#!/usr/bin/env python
'''
Migrate the old Galaxy tool shed to the next gen Galaxy tool shed. Specifically, the tool archives stored as
files in the old tool shed are migrated to mercurial repositories in the next gen tool shed. This script can
be run any number of times because it first eliminates any existing repositories and the db records associated
with them, and then migrates the old tool shed data to the new tool shed.
====== CRITICAL =======
0. This script must be run on a repo updated to changeset: 5621:4618be57481b
1. Before running this script, make sure the following config setting is set in tool_shed_wsgi.ini
   # Enable next-gen tool shed features
   enable_next_gen_tool_shed = True
2. This script requires the Galaxy instance to use Postgres for database storage.
To run this script, use "sh migrate_tools_to_repositories.sh" from this directory.
'''
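# Note: based on the usage message in main() below, the wrapper script presumably ends up
# running something like ( illustrative invocation, not taken from the wrapper itself ):
#   python ./scripts/tool_shed/migrate_tools_to_repositories.py tool_shed_wsgi.ini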
import sys, os, subprocess, ConfigParser, shutil, tarfile, tempfile
assert sys.version_info[:2] >= ( 2, 4 )
new_path = [ os.path.join( os.getcwd(), "lib" ) ]
new_path.extend( sys.path[1:] ) # remove scripts/ from the path
sys.path = new_path
from galaxy import eggs
import pkg_resources
pkg_resources.require( "psycopg2" )
import psycopg2
import galaxy.webapps.tool_shed.app
from mercurial import hg, ui, httprepo, commands
from time import strftime
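# psycopg2 is loaded through Galaxy's egg machinery above because, as the module
# docstring notes, this migration only supports a Postgres-backed tool shed database.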
def directory_hash_id( id ):
    s = str( id )
    l = len( s )
    # Shortcut -- ids 0-999 go under ../000/
    if l < 4:
        return [ "000" ]
    # Pad with zeros until a multiple of three
    padded = ( ( ( 3 - len( s ) ) % 3 ) * "0" ) + s
    # Drop the last three digits -- 1000 files per directory
    padded = padded[:-3]
    # Break into chunks of three
    return [ padded[ i * 3 : ( i + 1 ) * 3 ] for i in range( len( padded ) // 3 ) ]
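# For reference ( values computed from the logic above ): directory_hash_id( 123 ) returns
# [ "000" ] and directory_hash_id( 12345 ) returns [ "012" ], so a repository with id 12345
# would live under <file_path>/012/repo_12345.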
def get_versions( app, item ):
    """Get all versions of item whose state is a valid state"""
    valid_states = [ app.model.Tool.states.NEW,
                     app.model.Tool.states.WAITING,
                     app.model.Tool.states.APPROVED,
                     app.model.Tool.states.ARCHIVED ]
    versions = [ item ]
    this_item = item
    # Walk forward through newer versions, keeping those in a valid state
    while item.newer_version:
        if item.newer_version.state in valid_states:
            versions.append( item.newer_version )
        item = item.newer_version
    # Walk backward through older versions, starting again from the original item
    item = this_item
    while item.older_version:
        if item.older_version[ 0 ].state in valid_states:
            versions.insert( 0, item.older_version[ 0 ] )
        item = item.older_version[ 0 ]
    return versions
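# Note: older versions are inserted at the front of the list and newer versions are appended,
# so get_versions() returns versions ordered oldest to newest; add_tool_files_to_repository()
# below commits them in that order, one changeset per version.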
def get_approved_tools( app, sa_session ):
    """Get only the latest version of each tool from the database whose state is approved"""
    tools = []
    for tool in sa_session.query( app.model.Tool ) \
                          .order_by( app.model.Tool.table.c.name ):
        if tool.state == app.model.Tool.states.APPROVED:
            tools.append( tool )
    return tools
def create_repository_from_tool( app, sa_session, tool ):
    """Create a Repository db record and a mercurial repository on disk for the given tool."""
    # Make the repository name a form of the tool's tool_id by
    # lower-casing everything and replacing any blank spaces with underscores.
    repo_name = tool.tool_id.lower().replace( ' ', '_' )
    print "Creating repository '%s' in database" % ( repo_name )
    repository = app.model.Repository( name=repo_name,
                                       description=tool.description,
                                       user_id=tool.user_id )
    # Flush to get the id
    sa_session.add( repository )
    sa_session.flush()
    # Determine the local repository's path on disk
    dir = os.path.join( app.config.file_path, *directory_hash_id( repository.id ) )
    # Create directory if it does not exist
    if not os.path.exists( dir ):
        os.makedirs( dir )
    # Define repository name inside hashed directory
    repository_path = os.path.join( dir, "repo_%d" % repository.id )
    # Create repository directory
    if not os.path.exists( repository_path ):
        os.makedirs( repository_path )
    # Create the local hg repository
    print "Creating repository '%s' on disk" % ( os.path.abspath( repository_path ) )
    repo = hg.repository( ui.ui(), os.path.abspath( repository_path ), create=True )
    # Add an entry in the hgweb.config file for the new repository - this enables calls to repository.repo_path
    add_hgweb_config_entry( repository, repository_path )
    # Migrate tool categories
    for tca in tool.categories:
        category = tca.category
        print "Associating category '%s' with repository '%s' in database" % ( category.name, repository.name )
        rca = app.model.RepositoryCategoryAssociation( repository, category )
        sa_session.add( rca )
        sa_session.flush()
    # Migrate tool ratings
    print "Associating ratings for tool '%s' with repository '%s'" % ( tool.name, repository.name )
    for tra in tool.ratings:
        rra = app.model.RepositoryRatingAssociation( user=tra.user,
                                                     rating=tra.rating,
                                                     comment=tra.comment )
        rra.repository = repository
        sa_session.add( rra )
        sa_session.flush()
def add_hgweb_config_entry( repository, repository_path ):
    # Add an entry in the hgweb.config file for a new repository. This enables calls to repository.repo_path.
    # An entry looks something like: repos/test/mira_assembler = database/community_files/000/repo_123
    hgweb_config = "%s/hgweb.config" % os.getcwd()
    entry = "repos/%s/%s = %s" % ( repository.user.username, repository.name, repository_path.lstrip( './' ) )
    if os.path.exists( hgweb_config ):
        output = open( hgweb_config, 'a' )
    else:
        output = open( hgweb_config, 'w' )
        output.write( '[paths]\n' )
    output.write( "%s\n" % entry )
    output.close()
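# After migrating a couple of repositories, hgweb.config would contain something like
# ( illustrative values, extending the example entry in the comment above ):
#   [paths]
#   repos/test/mira_assembler = database/community_files/000/repo_123
#   repos/test/convert_characters = database/community_files/000/repo_124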
def create_hgrc_file( repository ):
    # At this point, an entry for the repository is required to be in the hgweb.config
    # file so we can call repository.repo_path.
    # Create a .hg/hgrc file that looks something like this:
    # [web]
    # allow_push = test
    # name = convert_characters1
    # push_ssl = False
    # Upon repository creation, only the owner can push to it ( allow_push setting ),
    # and since we support both http and https, we set push_ssl to False to override
    # the default (which is True) in the mercurial api.
    hgrc_file = os.path.abspath( os.path.join( repository.repo_path, ".hg", "hgrc" ) )
    output = open( hgrc_file, 'w' )
    output.write( '[web]\n' )
    output.write( 'allow_push = %s\n' % repository.user.username )
    output.write( 'name = %s\n' % repository.name )
    output.write( 'push_ssl = false\n' )
    output.flush()
    output.close()
def add_tool_files_to_repository( app, sa_session, tool ):
    """Clone the tool's repository, extract each valid tool archive into it, then commit and push."""
    current_working_dir = os.getcwd()
    # Get the repository to which the tool will be migrated
    repo_name = tool.tool_id.lower().replace( ' ', '_' )
    repository = get_repository_by_name( app, sa_session, repo_name )
    repo_path = os.path.abspath( repository.repo_path )
    # Get all valid versions of the tool
    tool_versions = get_versions( app, tool )
    for tool_version in tool_versions:
        print "------------------------------"
        print "Migrating tool '%s' version '%s' from archive to repository '%s'" % ( tool_version.tool_id, tool_version.version, repo_path )
        # Make a temporary working directory
        tmp_dir = tempfile.mkdtemp()
        tmp_archive_dir = os.path.join( tmp_dir, 'tmp_archive_dir' )
        if not os.path.exists( tmp_archive_dir ):
            os.makedirs( tmp_archive_dir )
        cmd = "hg clone %s" % repo_path
        os.chdir( tmp_archive_dir )
        os.system( cmd )
        os.chdir( current_working_dir )
        cloned_repo_dir = os.path.join( tmp_archive_dir, 'repo_%d' % repository.id )
        # We want these change sets to be associated with the owner of the repository, so we'll
        # set the HGUSER environment variable accordingly. We do this because in the mercurial
        # api, the default username to be used in commits is determined in this order: $HGUSER,
        # the [ui] section of hgrcs, then $EMAIL; searching stops at the first one that is set.
        os.environ[ 'HGUSER' ] = repository.user.username
        # Copy the tool archive to the tmp_archive_dir. The src file cannot be derived from
        # tool.file_name here because we have not loaded the Tool class in the model, so
        # tool.file_name defaults to /tmp/...
        dir = os.path.join( app.config.file_path, 'tools', *directory_hash_id( tool_version.id ) )
        src = os.path.abspath( os.path.join( dir, 'tool_%d.dat' % tool_version.id ) )
        dst = os.path.join( tmp_archive_dir, tool_archive_file_name( tool_version, src ) )
        shutil.copy( src, dst )
        # Extract the archive to cloned_repo_dir
        tarfile.open( dst ).extractall( path=cloned_repo_dir )
        # Remove the archive
        os.remove( dst )
        # Change current working directory to the cloned repository
        os.chdir( cloned_repo_dir )
        for root, dirs, files in os.walk( cloned_repo_dir ):
            if '.hg' in dirs:
                # Don't visit .hg directories
                dirs.remove( '.hg' )
            if 'hgrc' in files:
                # Don't include hgrc files in commit - should be impossible
                # since we don't visit .hg dirs, but just in case...
                files.remove( 'hgrc' )
            for dir in dirs:
                os.system( "hg add %s" % dir )
            for name in files:
                print "Adding file '%s' to cloned repository at %s" % ( name, str( os.getcwd() ) )
                os.system( "hg add %s" % name )
        print "Committing change set to cloned repository at %s" % str( os.getcwd() )
        os.system( "hg commit -m 'Migrated tool version %s from old tool shed archive to new tool shed repository'" % tool_version.version )
        print "Pushing changeset from cloned repository '%s' to repository '%s'" % ( cloned_repo_dir, repo_path )
        cmd = "hg push %s" % repo_path
        print "cmd is: ", cmd
        os.system( cmd )
        # The tool shed includes a repository source file browser, which currently depends upon
        # copies of the hg repository file store in the repo_path for browsing. We'll do the
        # following to make these copies.
        os.chdir( repo_path )
        os.system( 'hg update' )
        # Change the current working directory to the original
        os.chdir( current_working_dir )
        # Now that we have our new repository made current with all change sets,
        # we'll create a hgrc file for it.
        create_hgrc_file( repository )
        # Remove tmp directory
        shutil.rmtree( tmp_dir )
def get_repository_by_name( app, sa_session, repo_name ):
    """Get a repository from the database"""
    return sa_session.query( app.model.Repository ).filter_by( name=repo_name ).one()
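# Note: .one() raises sqlalchemy.orm.exc.NoResultFound ( or MultipleResultsFound ) if the
# lookup does not match exactly one repository, so a bad repo_name aborts the migration
# rather than silently skipping the tool.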
def contains( containing_str, contained_str ):
    return containing_str.lower().find( contained_str.lower() ) >= 0
def tool_archive_extension( file_name ):
    extension = None
    if extension is None:
        # Check for the bzip2 magic header: the bytes 'BZh' followed by a block size digit
        head = open( file_name, 'rb' ).read( 4 )
        try:
            assert head[:3] == 'BZh'
            assert int( head[-1] ) in range( 0, 10 )
            extension = 'tar.bz2'
        except AssertionError:
            pass
    if extension is None:
        # Check for the gzip magic header: the bytes \037\213 ( 0x1f 0x8b )
        try:
            assert head[:2] == '\037\213'
            extension = 'tar.gz'
        except:
            pass
    if extension is None:
        extension = 'tar'
    return extension
def tool_archive_file_name( tool, file_name ):
    return '%s_%s.%s' % ( tool.tool_id, tool.version, tool_archive_extension( file_name ) )

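# For example ( illustrative values ): a tool with tool_id 'mira_assembler' and version '1.0'
# whose archive is gzip-compressed would be copied to 'mira_assembler_1.0.tar.gz' before
# being extracted into the cloned repository.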
def main():
    if len( sys.argv ) < 2:
        print "Usage: python %s <Tool shed config file>" % sys.argv[0]
        sys.exit( 0 )
    now = strftime( "%Y-%m-%d %H:%M:%S" )
    print " "
    print "##########################################"
    print "%s - Migrating current tool archives to new tool repositories" % now
    # tool_shed_wsgi.ini file
    ini_file = sys.argv[1]
    conf_parser = ConfigParser.ConfigParser( {'here':os.getcwd()} )
    conf_parser.read( ini_file )
    try:
        db_conn_str = conf_parser.get( "app:main", "database_connection" )
    except ConfigParser.NoOptionError, e:
        db_conn_str = conf_parser.get( "app:main", "database_file" )
    print 'DB Connection: ', db_conn_str
    # Determine db connection - only postgres is supported
    if contains( db_conn_str, '///' ) and contains( db_conn_str, '?' ) and contains( db_conn_str, '&' ):
        # postgres:///galaxy_test?user=postgres&password=postgres
        db_str = db_conn_str.split( '///' )[1]
        db_name = db_str.split( '?' )[0]
        db_user = db_str.split( '?' )[1].split( '&' )[0].split( '=' )[1]
        db_password = db_str.split( '?' )[1].split( '&' )[1].split( '=' )[1]
    elif contains( db_conn_str, '//' ) and contains( db_conn_str, ':' ):
        # dialect://user:password@host/db_name
        db_name = db_conn_str.split('/')[-1]
        db_user = db_conn_str.split('//')[1].split(':')[0]
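    # Worked example for the second form above ( illustrative values ):
    # 'postgres://gxuser:gxpass@localhost/galaxy_toolshed' yields db_name = 'galaxy_toolshed'
    # and db_user = 'gxuser'; connection strings matching neither pattern leave these
    # variables unset.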
    # Instantiate app
    configuration = {}
    for key, value in conf_parser.items( "app:main" ):
        configuration[key] = value
    app = galaxy.webapps.tool_shed.app.UniverseApplication( global_conf=dict( __file__=ini_file ), **configuration )
    sa_session = app.model.context
    # Remove the hgweb.config file if it exists
    hgweb_config = "%s/hgweb.config" % os.getcwd()
    if os.path.exists( hgweb_config ):
        print "Removing old file: ", hgweb_config
        os.remove( hgweb_config )
    repo_records = 0
    rca_records = 0
    rra_records = 0
    for repo in sa_session.query( app.model.Repository ):
        # Remove the hg repository from disk. We have to be careful here, because old
        # tool files exist in app.config.file_path/tools and we don't want to delete them
        dir = os.path.join( app.config.file_path, *directory_hash_id( repo.id ) )
        if os.path.exists( dir ):
            print "Removing old repository file directory: ", dir
            shutil.rmtree( dir )
        # Delete all records from db tables:
        # repository_category_association, repository_rating_association, repository
        print "Deleting db records for repository: ", repo.name
        for rca in repo.categories:
            sa_session.delete( rca )
            rca_records += 1
        for rra in repo.ratings:
            sa_session.delete( rra )
            rra_records += 1
        sa_session.delete( repo )
        repo_records += 1
        sa_session.flush()
    print "Deleted %d rows from the repository table" % repo_records
    print "Deleted %d rows from the repository_category_association table" % rca_records
    print "Deleted %d rows from the repository_rating_association table" % rra_records
    # Migrate database tool, tool category and tool rating records to new
    # database repository, repository category and repository rating records,
    # and create the hg repository on disk for each.
    for tool in get_approved_tools( app, sa_session ):
        create_repository_from_tool( app, sa_session, tool )
    # Add, commit and push all valid versions of each approved tool to the
    # associated hg repository.
    for tool in get_approved_tools( app, sa_session ):
        add_tool_files_to_repository( app, sa_session, tool )
    app.shutdown()
    print ' '
    print 'Migration to next gen tool shed complete...'
    print "##########################################"
    sys.exit(0)

if __name__ == "__main__":
    main()