/lib/jwa_ingestor.rb
Ruby | 128 lines | 122 code | 6 blank | 0 comment | 4 complexity | 63b2bd2c69bdccf5aff90eebca34533b MD5 | raw file
Possible License(s): GPL-2.0, IPL-1.0
- require 'rubygems'
- gem 'fastercsv'
- require 'fastercsv'
- require 'active_fedora'
- class JWAIngestor
- def initialize(rootdir, job)
- Fedora::Repository.register(FEDORA_URL)
- @bnmap = Hash.new
- ActiveFedora::SolrService.register(SOLR_URL)
- @rootdir=rootdir
- @job = job
- self
- end
- def run(dir=@rootdir, parent=nil)
- process(dir)
- end
- def process(file, parent=nil)
- o=nil
- if(parent)
- o = ActiveFedora::Base.new
- o.add_relationship(:is_member_of, parent)
- else
- o = OralHistory.new
- Dir["#{file}/*"].each do |f|
- @job.log("processing file #{f}")
- if f =~ /(.*)\.(mp3|wav)/
- base = File.basename($1)
- a=@bnmap[base]
- unless a
- a = AudioRecord.new
- a.add_relationship(:is_part_of, o)
- @bnmap[base]=a
- end
- @job.log("processing file as audiorecord ds")
- a.add_datastream(create_file_ds(f))
- a.save
- else
- unless File.directory?(f)
- process_metadata(f,o) if f =~/metadata.csv/
- o.add_datastream(create_file_ds(f))
- else
- process(f, o)
- end
- end
- end
- end
- o.save
- o.pid
- end
- def process_metadata(file, oral)
- @job.log("processing metadata file #{File.basename(file)}")
- pstream = oral.datastreams['properties']
- dc = oral.datastreams['dublin_core']
- sen_pass = oral.datastreams['sensitive_passages']
- sig_pass = oral.datastreams['significant_passages']
- field_map = {
- 'contributor.narrator'=>[pstream, 'narrator'],
- 'publisher'=>[dc, 'publisher'],
- 'creator'=>[dc, 'creator'],
- 'format.extent'=>[dc, 'extent'],
- 'format.medium'=>[dc, 'medium'],
- 'format.mimetype'=>[dc, 'format'],
- 'type'=>[dc, 'type'],
- 'rights'=>[dc, 'rights'],
- 'language'=>[dc, 'language'],
- 'significant_passage'=>[sig_pass, 'significant_passage'],
- 'description.bio'=>[pstream, 'bio'],
- 'description.abstract'=>[dc, 'description'],
- 'Significant passages'=>[sig_pass, 'significant_passage'],
- 'sensitive_passage'=>[sen_pass, 'sensitive_passage'],
- 'Sensitive passages'=>[sen_pass, 'sensitive_passage'],
- 'format.medium.available'=>[pstream, 'hard_copy_availability'],
- 'format.medium.where'=>[pstream, 'hard_copy_location'],
- 'contributor.other'=>[pstream, 'other_contributor'],
- 'contributor.transcripteditor'=>[pstream, 'transcript_editor'],
- 'title'=>[dc, 'title'],
- 'subject'=>[dc, 'subject'],
- 'date'=>[dc, 'date'],
- 'title.alternative'=>[dc, 'alternative'],
- 'contributor.interviewer'=>[pstream, 'interviewer'],
- 'location.recording'=>[pstream, 'location'],
- 'coverage.temporal'=>[dc, 'temporal'],
- 'coverage.spatial'=>[dc, 'spatial'],
- 'coverage.spacial'=>[dc, 'spatial'],
- 'subject.lcsh' =>[dc, 'subject_heading'],
- 'notes'=>[pstream, 'notes']
- }
- FasterCSV::foreach(file) do |row|
- row.compact!
- d = field_map[row.first.strip]
- if d
- d.first.send("#{d.last}_append", demoronize(row.last.strip)) if row.size >1
- else
- @job.log "unknown field #{row.inspect}" if RAILS_ENV=='production'
- end
- end
- end
- private
- def demoronize(str)
- s = str.dup
- s.gsub!(/\x82/,',')
- s.gsub!(/\x84/,',,')
- s.gsub!(/\x85/,'...')
- s.gsub!(/\x88/,'^')
- s.gsub!(/\x89/,'o/oo')
- s.gsub!(/\x8b/,'<')
- s.gsub!(/\x8c/,'OE')
- s.gsub!(/\x91|\x92/,"'")
- s.gsub!(/\x93|\x94/,'"')
- s.gsub!(/\x95/,'*')
- s.gsub!(/\x96/,'-')
- s.gsub!(/\x97/,'--')
- s.gsub!(/\x98/,'~')
- s.gsub!(/\x99/,'TM')
- s.gsub!(/\x9b/,'>')
- s.gsub(/\x9c/,'oe')
- end
- def create_file_ds(f)
- @job.log("creating file ds ")
- ActiveFedora::Datastream.new(:dsID=>File.basename(f), :controlGroup=>'M', :blob=>File.open(f), :dsLabel=>File.basename(f))
- end
- end