/R/pubmed2df.R

https://github.com/massimoaria/bibliometrix · R · 77 lines · 54 code · 17 blank · 6 comment · 6 complexity · d41e1f45b58a858138c264972d607b92 MD5 · raw file

  # Convert the lines of a PubMed/MEDLINE text export into a data frame with
  # bibliometrix-style field tags.
  #
  # Args:
  #   D: character vector, one element per line of the export. The field tag is
  #      read from characters 1-4 of each line and the value from character 7
  #      onwards; a record starts at a line beginning with "PMID-".
  #
  # Returns:
  #   A data.frame with one row per record and one upper-cased character column
  #   per field tag (PubMed tags renamed via old_labs/new_labs below), plus
  #   DI (DOI taken from LID), ID (copy of DE), DB ("PUBMED") and RP (NA).
  #   Fields this function needs but that are absent from the input are
  #   returned as NA columns.
  #
  # Side effects:
  #   Prints a notice with cat() when any tag listed in old_labs is missing.
  pubmed2df <- function(D) {
    D <- D[nchar(D) > 0] # remove empty rows

    # Continuation lines carry no tag: their first four characters are blank.
    # Give each one the tag of the nearest preceding tagged line so that it is
    # grouped with the field it continues.
    is_cont <- grepl("^ {4}", D)
    tag_src <- seq_along(D)
    tag_src[is_cont] <- 0L
    tag_src <- cummax(tag_src)
    fill <- is_cont & tag_src > 0L
    if (any(fill)) {
      substr(D[fill], 1, 4) <- substr(D[tag_src[fill]], 1, 4)
    }

    Papers <- which(grepl("^PMID-", D)) # first row of each document
    nP <- length(Papers) # number of documents
    if (nP == 0) {
      stop("no 'PMID-' line found: input does not look like a PubMed export",
        call. = FALSE
      )
    }
    # Lines ahead of the first record belong to no document; drop them so the
    # per-line document index has exactly one entry per remaining line.
    if (Papers[1] > 1) {
      D <- D[Papers[1]:length(D)]
      Papers <- Papers - Papers[1] + 1
    }
    rowPapers <- diff(c(Papers, length(D) + 1))
    numPapers <- rep(seq_len(nP), rowPapers)

    DATA <- data.frame(
      Tag = gsub(" ", "", substr(D, 1, 4)),
      content = substr(D, 7, nchar(D)),
      Paper = numPapers,
      stringsAsFactors = FALSE
    )

    # One cell per (document, tag); repeated tags are joined with "---" and
    # split into their final separator further down.
    df <- DATA %>%
      group_by(.data$Paper, .data$Tag) %>%
      summarise(cont = paste(.data$content, collapse = "---")) %>%
      arrange(.data$Tag, .data$Paper) %>%
      pivot_wider(names_from = "Tag", values_from = "cont") %>%
      ungroup() %>%
      as.data.frame()

    # rename field tags
    old_labs <- c("AD", "AUID", "FAU", "IS", "IP", "SO", "JT", "TA", "MH", "PG", "PT", "VI", "DP")
    new_labs <- c("C1", "OI", "AF", "SN", "IS", "SO2", "SO", "J9", "DE", "PP", "DT", "VL", "PY")
    lab <- names(df)
    missing_tags <- !all(old_labs %in% lab)
    hit <- match(lab, old_labs)
    lab[!is.na(hit)] <- new_labs[hit[!is.na(hit)]]
    names(df) <- lab

    if (missing_tags) {
      cat(
        "\nWarning:\nIn your file, some mandatory metadata are missing. ",
        "Bibliometrix functions may not work properly!\n\n",
        "Please, take a look at the vignettes:\n",
        "- 'Data Importing and Converting' ",
        "(https://cran.r-project.org/web/packages/bibliometrix/vignettes/Data-Importing-and-Converting.html)\n",
        "- 'A brief introduction to bibliometrix' ",
        "(https://cran.r-project.org/web/packages/bibliometrix/vignettes/bibliometrix-vignette.html)\n\n",
        sep = ""
      )
    }

    # extract DOIs
    # LID may hold several "---"-joined identifiers ("... [pii]---... [doi]").
    # Prefer the entry flagged [doi]; otherwise keep the first entry.
    # Exact [[ ]] lookups are used from here on because `$` on a data.frame
    # partially matches column names (e.g. df$DE would pick up "DEP").
    extract_doi <- function(lid) {
      if (is.na(lid)) {
        return(NA_character_)
      }
      ids <- strsplit(lid, "---", fixed = TRUE)[[1]]
      doi <- ids[grepl("[doi]", ids, fixed = TRUE)]
      pick <- if (length(doi) > 0) doi[1] else ids[1]
      trimws(strsplit(pick, "[", fixed = TRUE)[[1]][1])
    }
    df$DI <- if ("LID" %in% names(df)) {
      vapply(df[["LID"]], extract_doi, character(1), USE.NAMES = FALSE)
    } else {
      NA_character_
    }
    df$PY <- if ("PY" %in% names(df)) {
      as.numeric(substr(df[["PY"]], 1, 4))
    } else {
      NA_real_
    }

    ### replace "---" with ";"
    tagsComma <- intersect(c("AU", "AF", "DE", "AID", "OT", "PHST", "DT"), names(df))
    ### replace "---" with " "
    otherTags <- setdiff(names(df), tagsComma)
    if (length(tagsComma) > 0) {
      df[tagsComma] <- lapply(df[tagsComma], function(x) {
        gsub("---", ";", x)
      })
    }
    df[otherTags] <- lapply(df[otherTags], function(x) {
      trimES(gsub("---", " ", x))
    })
    # multi-valued fields first, then everything else
    df <- df[c(tagsComma, otherTags)]

    df$DB <- "PUBMED"

    # remove * char from keywords
    keywords <- if ("DE" %in% names(df)) {
      gsub("\\*", "", df[["DE"]])
    } else {
      NA_character_
    }
    df$ID <- keywords
    df$DE <- keywords

    df <- data.frame(lapply(df, toupper), stringsAsFactors = FALSE)

    # add sep ; to affiliations
    df$C1 <- if ("C1" %in% names(df)) {
      gsub("\\.", ".;", df[["C1"]])
    } else {
      NA_character_
    }

    df$RP <- NA
    df <- df[names(df) != "Paper"]
    df
  }