OCLC-to-bibtex.awk - OCLC-to-bibtex.awk is an GAWK script t…

/unixSoft/bin/OCLC-to-bibtex.awk

https://bitbucket.org/durin42/dotfiles · AWK · 244 lines · 182 code · 40 blank · 22 comment · 55 complexity · 2657f9c0278975191d0635d693241770 MD5 · raw file

# 
# OCLC-to-bibtex.awk is a GAWK script to convert the export format of the 
# OCLC databases to BibTeX. It processes the input and tries to convert it into
# BibTeX entries which are written to a file in /tmp. This file is then opened 
# using the program specified in "prog" (by default: emacsclient). 
#
# NOTE: It does not test extensively what kind of publications are being
# processed. It only has some rudimentary checks for discovering whether the
# processed publications are InBooks or Articles.
# 
# Hedderik van Rijn, 020912-020914
#
# Do whatever you want with this script, but if you improve it, please send me a copy! 
# email: hvr-OCLC@van-rijn.org
#

# Set up the output file name and the viewer command, then print a banner.
#
#   tmpfile       BibTeX file the converted entries are appended to
#   oclc_version  version string shown in the banner
#   prog, atEnd   command (and suffix, e.g. "&") used in END to open tmpfile
BEGIN {
   tmpfile = "/tmp/tobib." systime() ".tmp.bib";

   # awk identifiers cannot contain "-": the former "oclc-version = ..." was
   # parsed as a subtraction, and the banner printed "0" instead of the
   # version string.
   oclc_version =  "OLCL-to-bibtex v0.1";

   # External interactive progs
#   prog = "xless ";
   prog = "emacsclient ";
#  prog = "open -a TextEdit ";
   atEnd = "&";

   # (Indirect) Output to stdout
#  prog = "cat ";
#  atEnd = "";

   print "# Exported from the OLCL FirstSearch PsychINFO database using " oclc_version;
}

# -------------------------------------------------------------------------

# A line whose first or second word contains "<letters>:" is taken to start a
# new field, so a multi-line Abstract or Descriptor that is still being
# collected gets its closing brace here.
# Note that this character class has no parentheses, unlike the one used by
# the continuation-line rule.
(match($1,/[A-Za-z]+:/) ||  match($2,/[A-Za-z]+:/)) {
  if (inAbstract == 1) {
    inAbstract = 0;
    abstract = abstract "}";
  }

  if (inDescriptor == 1) {
    inDescriptor = 0;
    keywords = keywords "}";
  }
}

# Continuation line: neither of the first two words looks like a field name
# (letters and/or parentheses followed by ":"). Append the whole line to
# whichever multi-line field is currently open.
!(match($1,/[A-Za-z()]+:/) || match($2,/[A-Za-z()]+:/)) {
  if (inDescriptor == 1) { keywords = keywords ", " $0; }
  if (inAbstract == 1)   { abstract = abstract " " $0; }
}



# Build the BibTeX author field from the "Author(s):" line.
# Anything from "Affiliation:" onwards is discarded, a stand-alone ";" between
# names becomes "and", and the lower-cased words of the first author up to and
# including the first word containing a comma are collected in mainauthor
# (used later for the citation key).
$1 == "Author(s):" {
  gsub(/Affiliation:.*/,"")
  author = "\tauthor = {";
  firstauthor = 1;

  i = 2;
  while (i <= NF) {
    if ($i == ";") {
      $i = "and";
      firstauthor = 0;
    }

    if (firstauthor) {
      mainauthor = mainauthor tolower($i);
    }
    # A comma ends the part of the name that goes into the citation key.
    if (match($i,",")) {
      firstauthor = 0;
    }

    author = author $i;
    if (i < NF) {
      author = author " ";
    }
    i++;
  }

  author = author "}";
  gsub(",","",mainauthor)
}

# Start the keywords field from the "Descriptor:" line. The "Descriptor:" and
# "(Major):" labels are stripped; following continuation lines are appended
# while inDescriptor is set.
$1 == "Descriptor:" {
  gsub(/Descriptor:[ \t]+/,"");
  gsub(/\(Major\):[ \t]+/,"");
  keywords = sprintf("\tkeywords = {{%s", $0);
  inDescriptor = 1;
}

# Append the "Identifier:" text as a second brace group and close the
# keywords field.
$1 == "Identifier:" {
  # The flag used by the other rules is inDescriptor; the former assignment
  # to "descriptor" set a variable that nothing reads.
  inDescriptor = 0;
  gsub(/Identifier:[ \t]+/,"")
  # Without a preceding Descriptor there is no field header yet; supply it so
  # the result is a complete, balanced "keywords = {{...}}" field.
  if (keywords == "") {
    keywords = "\tkeywords = {";
  }
  keywords = keywords "{" $0 "}}";
}

# Parse the "Source:" line into book-chapter fields (second word is "In:")
# or journal-article fields (anything else).
#
# Book chapter: pages come from the last word; the book title is the run of
# words between the last "Ed;" marker and the third-to-last word; the words
# between "In:" and that marker become the editor list.
# Journal article: the words before "Vol" form the journal name, the next
# word is read as "<volume>(<number>),", one non-numeric word (presumably the
# month) is skipped, and the following word is the year. mainyear is the year
# with its first two digits removed.
$1 == "Source:" {
  if ($2 == "In:") {
    type = 1; # In Book

    pages = "\tpages = {" $NF "}";
    gsub("-","--",pages)

    # Scan backwards for the "Ed;" marker. The "i > 2" bound stops the scan
    # at "In:" when the marker is missing; without it the index would run
    # below zero, which is a fatal error in gawk.
    booktitle = "";
    for (i = NF-2; i > 2 && $i != "Ed;"; i--) {
      if (booktitle == "") {
        booktitle = $i;
      } else {
        booktitle = $i " " booktitle;
      }
    }
    gsub(";","",booktitle);
    booktitle = "\tbooktitle = {" booktitle "}";
    gsub("\\.}","}",booktitle);

    # Everything left between "In:" and the marker is the editor list.
    editors = "";
    for (; i > 2; i--) {
      if (editors == "") {
        editors = $i;
      } else {
        editors = $i " " editors;
      }
    }
    gsub(" Ed;","",editors);
    gsub("; "," and ",editors);
    gsub(";","",editors);
    # The standard BibTeX field is "editor"; "editors" is not a standard field.
    editors = "\teditor = {" editors "}";
  } else {
    type = 2; # Journal

    # Collect the journal name. The "i <= NF" bound ends the loop when there
    # is no "Vol" word; without it the loop never terminated.
    journal = "";
    for (i = 2; i <= NF && $i != "Vol"; i++) {
      if (journal == "") {
        journal = $i;
      } else {
        journal = journal " " $i;
      }
    }
    journal = "\tjournal = {" journal "}";

    if (i > NF) {
      # No "Vol" marker: volume and number are unknown. year and mainyear
      # are left alone so the "Accession" fallback can still supply them.
      volume = "\tvolume = {}";
      number = "\tnumber = {}";
    } else {
      i++;
      vol = $i;
      sub(/\(.*\),/,"",vol)
      volume = "\tvolume = {" vol "}"
      sub(/.*\(/,"",$i)
      sub(/\),/,"",$i)
      number = "\tnumber = {" $i "}"
      i++;
      if ($i+1 == 1) { # Skip the month if necessary
        i++;
      }
      sub(",","",$i);
      year = "\tyear = {" $i "}";
      sub("[0-9][0-9]","",$i);
      mainyear = $i;
    }

    pages = "\tpages = {" $NF "}";
    gsub("-","--",pages)
    gsub("\\.","",pages)
  }
}

# Build the BibTeX title field. Words without lower-case letters are wrapped
# in braces as a whole; in all other words each capital letter is wrapped
# individually. A final "." before the closing brace is dropped.
$1 == "Title:" {
  title = "\ttitle = {";

  i = 2;
  while (i <= NF) {
    if ($i != toupper($i)) {
      gsub(/[A-Z]/,"{&}",$i);
    } else {
      $i = "{" $i "}";
    }

    title = title ($i);
    if (i < NF) {
      title = title " ";
    }
    i++;
  }

  title = title "}";
  gsub("\\.}","}",title);
}

# Start the abstract field from the "Abstract:" line; following continuation
# lines are appended while inAbstract is set.
$1 == "Abstract:" {
  inAbstract = 1;
  gsub(/Abstract:[ \t]*/,"");
  abstract = sprintf("\tabstract = {%s", $0);
}

## Fall back to the "Accession" line when no year has been found yet: the
## third word, cut at its first "-", is used as the year, and the citation
## key gets a "?" suffix to flag the guess.

$1 == "Accession" && mainyear == "" {
  gsub(/-.*/,"",$3);
  year = "\tyear = {" $3 " (had to use heuristics to determine the year!)}";

  # Drop the first two digits for the citation key.
  sub("[0-9][0-9]","",$3);
  mainyear = $3 "?";
}

# Append `field` to the ",\n"-separated list `body` and return the result.
# Empty fields are skipped so no bare "," line ends up in the entry.
function appendField(body, field) {
  if (field == "") {
    return body;
  }
  if (body == "") {
    return field;
  }
  return body ",\n" field;
}

# Return `s` with as many "}" appended as are needed to match its "{" count.
function balanceBraces(s,    rest, opens, closes) {
  rest = s;
  opens = gsub(/[{]/, "", rest);
  rest = s;
  closes = gsub(/[}]/, "", rest);
  while (closes < opens) {
    s = s "}";
    closes++;
  }
  return s;
}

# Append the record collected so far to tmpfile as one BibTeX entry.
# Nothing is written unless an author was seen (mainauthor is non-empty).
# The citation key is "<mainauthor>:<mainyear>x"; the entry type is InBook
# when type == 1 and Article otherwise.
# After an entry has been written all per-record fields are cleared, so the
# next record cannot inherit e.g. the previous abstract or keywords.
function printEntry(    body) {
  if (mainauthor != "") {
    if (type == 1) { # In Book
      typestring = "InBook";
    } else {
      typestring = "Article";
    }

    body = appendField("", author);
    body = appendField(body, title);
    body = appendField(body, year);
    if (type == 1) { # In Book
      body = appendField(body, booktitle);
      body = appendField(body, editors);
      body = appendField(body, pages);
    }
    if (type == 2) { # Article
      body = appendField(body, journal);
      body = appendField(body, volume);
      body = appendField(body, number);
      body = appendField(body, pages);
    }
    # A record can end while the abstract or the keywords are still open
    # (no later field line closed them), so balance the braces here.
    body = appendField(body, balanceBraces(abstract));
    body = appendField(body, balanceBraces(keywords));

    print("@" typestring "{" mainauthor ":" mainyear "x,") >> tmpfile;
    print(body) >> tmpfile;
    print("}") >> tmpfile;
    print("") >> tmpfile;
    print("") >> tmpfile;

    # Reset the per-record state.
    author = title = year = "";
    booktitle = editors = pages = "";
    journal = volume = number = "";
    abstract = keywords = "";
    type = 0;
    inAbstract = inDescriptor = 0;
  }
  mainauthor = "";
  mainyear = "";
}  

# An empty line (no fields) ends the current record: write it out.
!NF {
  printEntry();
}

# Write the last record, then hand the generated file to the viewer command
# configured in BEGIN (prog, followed by the atEnd suffix).
END {
  printEntry();
  viewer = prog " " tmpfile " " atEnd;
  system(viewer);
}
Alerts (4)

Complexity hotspot; lines 112 to 113 (total complexity: 4)
112 113
Complexity hotspot; lines 206 to 207 (total complexity: 4)
206 207