/scripts/tool_shed/migrate_tools_to_repositories.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 337 lines · 318 code · 1 blank · 18 comment · 0 complexity · f58a55c7426a3ec2dad1f07373290085 MD5 · raw file

  1. #!/usr/bin/env python
  2. '''
  3. Migrate old Galaxy tool shed to next gen Galaxy tool shed. Specifically, the tool archives stored as
  4. files in the old tool shed will be migrated to mercurial repositories in the next gen tool shed. This
  5. script can be run any number of times as it initially eliminates any current repositories and db records
  6. associated with them, and migrates old tool shed stuff to new tool shed stuff.
  7. ====== CRITICAL =======
  8. 0. This script must be run on a repo updated to changeset: 5621:4618be57481b
  9. 1. Before running this script, make sure the following config setting is set in tool_shed_wsgi.ini
  10. # Enable next-gen tool shed features
  11. enable_next_gen_tool_shed = True
  12. 2. This script requires the Galaxy instance to use Postgres for database storage.
  13. To run this script, use "sh migrate_tools_to_repositories.sh" from this directory
  14. '''
  15. import sys, os, subprocess, ConfigParser, shutil, tarfile, tempfile
  16. assert sys.version_info[:2] >= ( 2, 4 )
  17. new_path = [ os.path.join( os.getcwd(), "lib" ) ]
  18. new_path.extend( sys.path[1:] ) # remove scripts/ from the path
  19. sys.path = new_path
  20. from galaxy import eggs
  21. import pkg_resources
  22. pkg_resources.require( "psycopg2" )
  23. import psycopg2
  24. import galaxy.webapps.tool_shed.app
  25. from mercurial import hg, ui, httprepo, commands
  26. from time import strftime
  27. def directory_hash_id( id ):
  28. s = str( id )
  29. l = len( s )
  30. # Shortcut -- ids 0-999 go under ../000/
  31. if l < 4:
  32. return [ "000" ]
  33. # Pad with zeros until a multiple of three
  34. padded = ( ( ( 3 - len( s ) ) % 3 ) * "0" ) + s
  35. # Drop the last three digits -- 1000 files per directory
  36. padded = padded[:-3]
  37. # Break into chunks of three
  38. return [ padded[i*3:(i+1)*3] for i in range( len( padded ) // 3 ) ]
  39. def get_versions( app, item ):
  40. """Get all versions of item whose state is a valid state"""
  41. valid_states = [ app.model.Tool.states.NEW,
  42. app.model.Tool.states.WAITING,
  43. app.model.Tool.states.APPROVED,
  44. app.model.Tool.states.ARCHIVED ]
  45. versions = [ item ]
  46. this_item = item
  47. while item.newer_version:
  48. if item.newer_version.state in valid_states:
  49. versions.append( item.newer_version )
  50. item = item.newer_version
  51. item = this_item
  52. while item.older_version:
  53. if item.older_version[ 0 ].state in valid_states:
  54. versions.insert( 0, item.older_version[ 0 ] )
  55. item = item.older_version[ 0 ]
  56. return versions
  57. def get_approved_tools( app, sa_session ):
  58. """Get only the latest version of each tool from the database whose state is approved"""
  59. tools = []
  60. for tool in sa_session.query( app.model.Tool ) \
  61. .order_by( app.model.Tool.table.c.name ):
  62. if tool.state == app.model.Tool.states.APPROVED:
  63. tools.append( tool )
  64. return tools
  65. def create_repository_from_tool( app, sa_session, tool ):
  66. # Make the repository name a form of the tool's tool_id by
  67. # lower-casing everything and replacing any blank spaces with underscores.
  68. repo_name = tool.tool_id.lower().replace( ' ', '_' )
  69. print "Creating repository '%s' in database" % ( repo_name )
  70. repository = app.model.Repository( name=repo_name,
  71. description=tool.description,
  72. user_id = tool.user_id )
  73. # Flush to get the id
  74. sa_session.add( repository )
  75. sa_session.flush()
  76. # Determine the local repository's path on disk
  77. dir = os.path.join( app.config.file_path, *directory_hash_id( repository.id ) )
  78. # Create directory if it does not exist
  79. if not os.path.exists( dir ):
  80. os.makedirs( dir )
  81. # Define repository name inside hashed directory
  82. repository_path = os.path.join( dir, "repo_%d" % repository.id )
  83. # Create repository directory
  84. if not os.path.exists( repository_path ):
  85. os.makedirs( repository_path )
  86. # Create the local hg repository
  87. print "Creating repository '%s' on disk" % ( os.path.abspath( repository_path ) )
  88. repo = hg.repository( ui.ui(), os.path.abspath( repository_path ), create=True )
  89. # Add an entry in the hgweb.config file for the new repository - this enables calls to repository.repo_path
  90. add_hgweb_config_entry( repository, repository_path )
  91. # Migrate tool categories
  92. for tca in tool.categories:
  93. category = tca.category
  94. print "Associating category '%s' with repository '%s' in database" % ( category.name, repository.name )
  95. rca = app.model.RepositoryCategoryAssociation( repository, category )
  96. sa_session.add( rca )
  97. sa_session.flush()
  98. # Migrate tool ratings
  99. print "Associating ratings for tool '%s' with repository '%s'" % ( tool.name, repository.name )
  100. for tra in tool.ratings:
  101. rra = app.model.RepositoryRatingAssociation( user=tra.user,
  102. rating=tra.rating,
  103. comment=tra.comment )
  104. rra.repository=repository
  105. sa_session.add( rra )
  106. sa_session.flush()
  107. def add_hgweb_config_entry( repository, repository_path ):
  108. # Add an entry in the hgweb.config file for a new repository. This enables calls to repository.repo_path.
  109. # An entry looks something like: repos/test/mira_assembler = database/community_files/000/repo_123
  110. hgweb_config = "%s/hgweb.config" % os.getcwd()
  111. entry = "repos/%s/%s = %s" % ( repository.user.username, repository.name, repository_path.lstrip( './' ) )
  112. if os.path.exists( hgweb_config ):
  113. output = open( hgweb_config, 'a' )
  114. else:
  115. output = open( hgweb_config, 'w' )
  116. output.write( '[paths]\n' )
  117. output.write( "%s\n" % entry )
  118. output.close()
  119. def create_hgrc_file( repository ):
  120. # At this point, an entry for the repository is required to be in the hgweb.config
  121. # file so we can call repository.repo_path.
  122. # Create a .hg/hgrc file that looks something like this:
  123. # [web]
  124. # allow_push = test
  125. # name = convert_characters1
  126. # push_ssl = False
  127. # Upon repository creation, only the owner can push to it ( allow_push setting ),
  128. # and since we support both http and https, we set push_ssl to False to override
  129. # the default (which is True) in the mercurial api.
  130. hgrc_file = os.path.abspath( os.path.join( repository.repo_path, ".hg", "hgrc" ) )
  131. output = open( hgrc_file, 'w' )
  132. output.write( '[web]\n' )
  133. output.write( 'allow_push = %s\n' % repository.user.username )
  134. output.write( 'name = %s\n' % repository.name )
  135. output.write( 'push_ssl = false\n' )
  136. output.flush()
  137. output.close()
  138. def add_tool_files_to_repository( app, sa_session, tool ):
  139. current_working_dir = os.getcwd()
  140. # Get the repository to which the tool will be migrated
  141. repo_name = tool.tool_id.lower().replace( ' ', '_' )
  142. repository = get_repository_by_name( app, sa_session, repo_name )
  143. repo_path = os.path.abspath( repository.repo_path )
  144. # Get all valid versions of the tool
  145. tool_versions = get_versions( app, tool )
  146. for tool_version in tool_versions:
  147. print "------------------------------"
  148. print "Migrating tool '%s' version '%s' from archive to repository '%s'" % ( tool_version.tool_id, tool_version.version, repo_path )
  149. # Make a temporary working directory
  150. tmp_dir = tempfile.mkdtemp()
  151. tmp_archive_dir = os.path.join( tmp_dir, 'tmp_archive_dir' )
  152. if not os.path.exists( tmp_archive_dir ):
  153. os.makedirs( tmp_archive_dir )
  154. cmd = "hg clone %s" % repo_path
  155. os.chdir( tmp_archive_dir )
  156. os.system( cmd )
  157. os.chdir( current_working_dir )
  158. cloned_repo_dir = os.path.join( tmp_archive_dir, 'repo_%d' % repository.id )
  159. # We want these change sets to be associated with the owner of the repository, so we'll
  160. # set the HGUSER environment variable accordingly. We do this because in the mercurial
  161. # api, the default username to be used in commits is determined in this order: $HGUSER,
  162. # [ui] section of hgrcs, $EMAIL and stop searching if one of these is set.
  163. os.environ[ 'HGUSER' ] = repository.user.username
  164. # Copy the tool archive to the tmp_archive_dir. The src file cannot be derived from
  165. # tool.file_name here because we have not loaded the Tool class in the model, so the
  166. # tool.file_name defaults to /tmp/...
  167. dir = os.path.join( app.config.file_path, 'tools', *directory_hash_id( tool_version.id ) )
  168. src = os.path.abspath( os.path.join( dir, 'tool_%d.dat' % tool_version.id ) )
  169. dst = os.path.join( tmp_archive_dir, tool_archive_file_name( tool_version, src ) )
  170. shutil.copy( src, dst )
  171. # Extract the archive to cloned_repo_dir
  172. tarfile.open( dst ).extractall( path=cloned_repo_dir )
  173. # Remove the archive
  174. os.remove( dst )
  175. # Change current working directory to the cloned repository
  176. os.chdir( cloned_repo_dir )
  177. for root, dirs, files in os.walk( cloned_repo_dir ):
  178. if '.hg' in dirs:
  179. # Don't visit .hg directories
  180. dirs.remove( '.hg' )
  181. if 'hgrc' in files:
  182. # Don't include hgrc files in commit - should be impossible
  183. # since we don't visit .hg dirs, but just in case...
  184. files.remove( 'hgrc' )
  185. for dir in dirs:
  186. os.system( "hg add %s" % dir )
  187. for name in files:
  188. print "Adding file '%s' to cloned repository at %s" % ( name, str( os.getcwd() ) )
  189. os.system( "hg add %s" % name )
  190. print "Committing change set to cloned repository at %s" % str( os.getcwd() )
  191. os.system( "hg commit -m 'Migrated tool version %s from old tool shed archive to new tool shed repository'" % tool_version.version )
  192. print "Pushing changeset from cloned repository '%s' to repository '%s'" % ( cloned_repo_dir, repo_path )
  193. cmd = "hg push %s" % repo_path
  194. print "cmd is: ", cmd
  195. os.system( cmd )
  196. # The tool shed includes a repository source file browser, which currently depends upon
  197. # copies of the hg repository file store in the repo_path for browsing. We'll do the
  198. # following to make these copies.
  199. os.chdir( repo_path )
  200. os.system( 'hg update' )
  201. # Change the current working directory to the original
  202. os.chdir( current_working_dir )
  203. # Now that we have out new repository made current with all change sets,
  204. # we'll create a hgrc file for it.
  205. create_hgrc_file( repository )
  206. # Remove tmp directory
  207. shutil.rmtree( tmp_dir )
  208. def get_repository_by_name( app, sa_session, repo_name ):
  209. """Get a repository from the database"""
  210. return sa_session.query( app.model.Repository ).filter_by( name=repo_name ).one()
  211. def contains( containing_str, contained_str ):
  212. return containing_str.lower().find( contained_str.lower() ) >= 0
  213. def tool_archive_extension( file_name ):
  214. extension = None
  215. if extension is None:
  216. head = open( file_name, 'rb' ).read( 4 )
  217. try:
  218. assert head[:3] == 'BZh'
  219. assert int( head[-1] ) in range( 0, 10 )
  220. extension = 'tar.bz2'
  221. except AssertionError:
  222. pass
  223. if extension is None:
  224. try:
  225. assert head[:2] == '\037\213'
  226. extension = 'tar.gz'
  227. except:
  228. pass
  229. if extension is None:
  230. extension = 'tar'
  231. return extension
  232. def tool_archive_file_name( tool, file_name ):
  233. return '%s_%s.%s' % ( tool.tool_id, tool.version, tool_archive_extension( file_name ) )
  234. def main():
  235. if len( sys.argv ) < 2:
  236. print "Usage: python %s <Tool shed config file>" % sys.argv[0]
  237. sys.exit( 0 )
  238. now = strftime( "%Y-%m-%d %H:%M:%S" )
  239. print " "
  240. print "##########################################"
  241. print "%s - Migrating current tool archives to new tool repositories" % now
  242. # tool_shed_wsgi.ini file
  243. ini_file = sys.argv[1]
  244. conf_parser = ConfigParser.ConfigParser( {'here':os.getcwd()} )
  245. conf_parser.read( ini_file )
  246. try:
  247. db_conn_str = conf_parser.get( "app:main", "database_connection" )
  248. except ConfigParser.NoOptionError, e:
  249. db_conn_str = conf_parser.get( "app:main", "database_file" )
  250. print 'DB Connection: ', db_conn_str
  251. # Determine db connection - only postgres is supported
  252. if contains( db_conn_str, '///' ) and contains( db_conn_str, '?' ) and contains( db_conn_str, '&' ):
  253. # postgres:///galaxy_test?user=postgres&password=postgres
  254. db_str = db_conn_str.split( '///' )[1]
  255. db_name = db_str.split( '?' )[0]
  256. db_user = db_str.split( '?' )[1].split( '&' )[0].split( '=' )[1]
  257. db_password = db_str.split( '?' )[1].split( '&' )[1].split( '=' )[1]
  258. elif contains( db_conn_str, '//' ) and contains( db_conn_str, ':' ):
  259. # dialect://user:password@host/db_name
  260. db_name = db_conn_str.split('/')[-1]
  261. db_user = db_conn_str.split('//')[1].split(':')[0]
  262. # Instantiate app
  263. configuration = {}
  264. for key, value in conf_parser.items( "app:main" ):
  265. configuration[key] = value
  266. app = galaxy.webapps.tool_shed.app.UniverseApplication( global_conf=dict( __file__=ini_file ), **configuration )
  267. sa_session = app.model.context
  268. # Remove the hgweb.config file if it exists
  269. hgweb_config = "%s/hgweb.config" % os.getcwd()
  270. if os.path.exists( hgweb_config ):
  271. print "Removing old file: ", hgweb_config
  272. os.remove( hgweb_config )
  273. repo_records = 0
  274. rca_records = 0
  275. rra_records = 0
  276. for repo in sa_session.query( app.model.Repository ):
  277. # Remove the hg repository from disk. We have to be careful here, because old
  278. # tool files exist in app.config.file_path/tools and we don't want to delete them
  279. dir = os.path.join( app.config.file_path, *directory_hash_id( repo.id ) )
  280. if os.path.exists( dir ):
  281. print "Removing old repository file directory: ", dir
  282. shutil.rmtree( dir )
  283. # Delete all records from db tables:
  284. # repository_category_association, repository_rating_association, repository
  285. print "Deleting db records for repository: ", repo.name
  286. for rca in repo.categories:
  287. sa_session.delete( rca )
  288. rca_records += 1
  289. for rra in repo.ratings:
  290. sa_session.delete( rra )
  291. rra_records += 1
  292. sa_session.delete( repo )
  293. repo_records += 1
  294. sa_session.flush()
  295. print "Deleted %d rows from the repository table" % repo_records
  296. print "Deleted %d rows from the repository_category_association table" % rca_records
  297. print "Deleted %d rows from the repository_rating_association table" % rra_records
  298. # Migrate database tool, tool category and tool rating records to new
  299. # database repository, repository category and repository rating records
  300. # and create the hg repository on disk for each.
  301. for tool in get_approved_tools( app, sa_session ):
  302. create_repository_from_tool( app, sa_session, tool )
  303. # Add, commit and push all valid versions of each approved tool to the
  304. # associated hg repository.
  305. for tool in get_approved_tools( app, sa_session ):
  306. add_tool_files_to_repository( app, sa_session, tool )
  307. app.shutdown()
  308. print ' '
  309. print 'Migration to next gen tool shed complete...'
  310. print "##########################################"
  311. sys.exit(0)
  312. if __name__ == "__main__":
  313. main()