PageRenderTime 27ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/marshal/marshal.R

https://gitlab.com/proteomics-bioinformatics-lnbio/galaxy_proteomics
R | 202 lines | 153 code | 13 blank | 36 comment | 47 complexity | bfa08efe4bf7033849590394569c0367 MD5 | raw file
  1. #!/usr/bin/env Rscript
  2. # marshal.R
  3. # AUTHOR: Daniel Travieso
  4. # E-mail: danielgtravieso@gmail.com
  5. # LAST REVISED: August 2015
  6. #
  7. # Required packages to work: ("getopt")
  8. # Laboratory of Mass Spectrometry at Brazilian Biosciences National Laboratory
  9. # http://lnbio.cnpem.br/
  10. # Copyright CC BY-NC-SA (c) 2014 Brazilian Center for Research in Energy and Materials
  11. # All rights reserved.
  12. require('methods', quietly=TRUE);
  13. require('getopt', quietly=TRUE);
  14. require('RMySQL', quietly=TRUE);
  15. #define de options input that the code will have
  16. opt = matrix(c(
  17. 'inputfile_name', 'i', 1, 'character',
  18. 'tax_id', 't', 1, 'character',
  19. 'keepcon', 'k', 1, 'character',
  20. 'outputfile_name', 'o', 1, 'character'
  21. ),byrow=TRUE, ncol=4);
  22. # parse de input
  23. options = getopt(opt);
  24. table <- read.delim(options$inputfile_name, header=TRUE, fill=TRUE);
  25. db.socket <- '/var/run/mysqld/mysqld.sock';
  26. #Database connections and queries
  27. db.connection <- dbConnect(RMySQL::MySQL(), user='galaxy', host='localhost', dbname='conversionMarcelo', password='123456', unix.sock=db.socket);
  28. # the '?' will be replaced in the query
  29. db.sql.synonym <- "SELECT synonyms FROM Synonyms2Uniprot WHERE uniprot = ";
  30. db.sql.uniprot <- "SELECT uniprot FROM Synonyms2Uniprot WHERE synonyms = ";
  31. db.sql.all <- "SELECT * FROM Synonyms2Uniprot WHERE synonyms = ";
  32. #Definition of all regular expressions to be used
  33. regex.intensity <- "^Intensity[.]([^[:digit:]]+)[[:digit:]]+$";
  34. regex.lfqintensity <- "^LFQ[.]intensity[.]([^[:digit:]]+)[[:digit:]]+$";
  35. regex.spectral <- "^MS[.]MS[.]Count[.]([^[:digit:]]+)[[:digit:]]+$";
  36. regex.proteinIDs <- "^Protein[.]IDs$";
  37. regex.id.uniprot.1 <- "^[OPQ][[:digit:]][[:upper:][:digit:]]{3}[[:digit:]]";
  38. regex.id.uniprot.2 <- "^[A-NR-Z][[:digit:]]([[:upper:]][[:upper:][:digit:]]{2}[[:digit:]]){1,2}";
  39. regex.id.ipi <- "^IPI[[:digit:]]+$";
  40. regex.id.tair <- "^A[Tt][1-5MC][:alnum:][[:digit:]]+$";
  41. regex.id.ensembl <- "^ENS[[:alnum:]]+$";
  42. regex.id.refseq <- "^AC_|^N[CGTWSZMR]_|^X[MR]_|^[ANYXZ]P_";
  43. regex.id.contaminant_reversed <- "^(CON__|REV__)";
  44. regex.id.contaminant <- "^CON__";
  45. #get the names for the columns and separate them for better use
  46. column_names.intensity <- grep(regex.intensity, colnames(table), value=TRUE);
  47. column_names.lfqintensity <- grep(regex.lfqintensity, colnames(table), value=TRUE);
  48. column_names.spectral <- grep(regex.spectral, colnames(table), value=TRUE);
  49. column_names.proteinIDs <- grep(regex.proteinIDs, colnames(table), value=TRUE);
  50. column_names.uniprot_conversion <- "Uniprot.Conversion.ID";
  51. #create names of categories for extra row to be included
  52. categories.intensity <- gsub(regex.intensity, '\\1.intensity', column_names.intensity);
  53. categories.lfqintensity <- gsub(regex.lfqintensity, '\\1.lfq.intensity.', column_names.lfqintensity);
  54. categories.spectral <- gsub(regex.spectral, '\\1.speccount', column_names.spectral);
  55. #add a blank column
  56. table <- cbind(table[,c(column_names.proteinIDs)], rep("", nrow(table)), table[,c(column_names.intensity, column_names.lfqintensity, column_names.spectral)])
  57. #rename the column names
  58. colnames(table) <- c(column_names.proteinIDs,column_names.uniprot_conversion,column_names.intensity, column_names.lfqintensity, column_names.spectral)
  59. #set the protein IDs column as character
  60. table[,column_names.proteinIDs] <- sapply(table[,column_names.proteinIDs], as.character);
  61. #add a blank row
  62. table <- rbind(table[0,], c("", "", categories.intensity, categories.lfqintensity, categories.spectral), table[seq(1, nrow(table)),]);
  63. table[,column_names.uniprot_conversion] <- sapply(table[,column_names.uniprot_conversion], as.character);
  64. cell.tax <- options$tax_id;
  65. #finding the id type
  66. rows.excluded <- c();
  67. for (row in seq(2, nrow(table))) {
  68. row.deleted <- FALSE;
  69. cell.id.notfound <- FALSE;
  70. cell.row <- row;
  71. cell.value <- strsplit(table[cell.row, column_names.proteinIDs], ';')[[1]];
  72. #print("Current table row:");
  73. #print(cell.value);
  74. cell.id <- "";
  75. for (id_code in cell.value) {
  76. # remove contaminant or reversed protein ids from search. get the first valid id
  77. if(!is.na(id_code) && !grepl(regex.id.contaminant_reversed, id_code)) {
  78. cell.id <- id_code;
  79. break;
  80. }
  81. }
  82. if (cell.id == "") {
  83. cell.id.notfound <- TRUE;
  84. # NOT FOUND ANY RELEVANT IDs (All contaminat or reversed)
  85. if (options$keepcon == "Yes") {
  86. cell.id <- grep(regex.id.contaminant, cell.value, value=TRUE)[1];
  87. if (is.na(cell.id)) {
  88. rows.excluded <- c(rows.excluded, row);
  89. row.deleted <- TRUE;
  90. } else {
  91. cell.id <- gsub(regex.id.contaminant, '', cell.id);
  92. }
  93. } else {
  94. rows.excluded <- c(rows.excluded, row);
  95. row.deleted <- TRUE;
  96. }
  97. }
  98. if (!row.deleted) {
  99. # discover the type of id
  100. if (grepl(regex.id.uniprot.1, cell.id) || grepl(regex.id.uniprot.2, cell.id)) {
  101. cell.hash <- 'uniprot';
  102. } else if(grepl(regex.id.ipi, cell.id)) {
  103. cell.hash <- 'ipi';
  104. } else if(grepl(regex.id.tair, cell.id)) {
  105. cell.hash <- 'tair';
  106. } else if(grepl(regex.id.ensembl, cell.id)) {
  107. cell.hash <- 'ensembl';
  108. } else if(grepl(regex.id.refseq, cell.id)) {
  109. cell.hash <- 'refseq'
  110. } else {
  111. cell.hash <- 'genesymbol'
  112. }
  113. # if the id is not uniprot type, get the correlant uniprot for that id
  114. if (cell.hash != 'uniprot') {
  115. # if is genesymbol use the database to search for a uniprot with same tax number
  116. if (cell.hash == 'genesymbol') {
  117. cell.id.escapeChars <- dbEscapeStrings(db.connection, cell.id);
  118. db.select.all <- paste(db.sql.all, "'", cell.id.escapeChars, "';", sep='');
  119. db.select.results <- dbGetQuery(db.connection, db.select.all);
  120. for (rrow in seq(1, nrow(db.select.results))) {
  121. db.select.results.row <- db.select.results[row,];
  122. if (db.select.results.row[[4]] == cell.tax) {
  123. cell.id.uniprot <- db.select.results.row[[2]];
  124. if (db.select.results.row[[5]] == "YES") {
  125. break;
  126. }
  127. }
  128. }
  129. # if is not genesymbol, query for all ids in the table, and get the result uniprot
  130. } else {
  131. cell.id.escapeChars <- dbEscapeStrings(db.connection, cell.id);
  132. db.select.uniprot <- paste(db.sql.uniprot, "'", cell.id.escapeChars, "';", sep='');
  133. db.select.results <- dbGetQuery(db.connection, db.select.uniprot);
  134. db.select.results.ids <- as.vector(db.select.results[[1]]);
  135. for (res in db.select.results.ids) {
  136. if (grepl(res, regex.id.uniprot.1) == TRUE ||
  137. grepl(res, regex.id.uniprot.2) == TRUE) {
  138. cell.id.uniprot <- res;
  139. break;
  140. }
  141. }
  142. }
  143. # write the uniprot conversion cell
  144. if (cell.id.notfound) {
  145. cell.id <- paste0("CON__", cell.id);
  146. }
  147. table[cell.row, column_names.proteinIDs] <- cell.id;
  148. table[cell.row, column_names.uniprot_conversion] <- paste0(cell.id, "_", cell.id.uniprot);
  149. } else {
  150. # if the id is already uniprot, do nothing and store the uniprot id value
  151. cell.id.uniprot <- cell.id;
  152. cell.id <- "";
  153. cell.id.possibleid <- c();
  154. cell.id.uniprot <- sub('-[[:digit:]]', '', cell.id.uniprot);
  155. cell.id.escapeChars <- dbEscapeStrings(db.connection, cell.id.uniprot);
  156. db.select.synonym <- paste(db.sql.synonym, "'", cell.id.escapeChars, "';", sep='');
  157. db.select.results <- dbGetQuery(db.connection, db.select.synonym);
  158. db.select.results.ids <- as.vector(db.select.results[[1]]);
  159. for (res in db.select.results.ids) {
  160. cell.id.possibleid <- c(cell.id.possibleid, res);
  161. if (!grepl('^IPI|^ENS|^A[Tt]',res)) {
  162. cell.id <- res;
  163. break;
  164. }
  165. }
  166. if (cell.id == "") {
  167. if (!is.null(cell.id.possibleid)) {
  168. cell.id <- sort(cell.id.possibleid)[1]
  169. } else {
  170. cell.id <- cell.id.uniprot;
  171. }
  172. }
  173. # write the uniprot conversion cell
  174. if (cell.id.notfound) {
  175. table[cell.row, column_names.proteinIDs] <- paste0("CON__", cell.id.uniprot);
  176. table[cell.row, column_names.uniprot_conversion] <- paste0("CON__", cell.id, "_", cell.id.uniprot);
  177. } else {
  178. table[cell.row, column_names.proteinIDs] <- cell.id.uniprot;
  179. table[cell.row, column_names.uniprot_conversion] <- paste0(cell.id, "_", cell.id.uniprot);
  180. }
  181. }
  182. }
  183. }
  184. table <- table[-rows.excluded,];
  185. #dbDisconnect(db.connection);
  186. #write out the file
  187. output_handler <- file(options$outputfile_name, "w")
  188. write.table(table, file=output_handler, sep="\t", row.names=FALSE);
  189. close(output_handler)