
/lib/metadata/extract.rb

https://github.com/devilcoders/metadata
Ruby | 1451 lines | 1415 code | 26 blank | 10 comment | 31 complexity
  1. require 'iconv'
  2. require 'pathname'
  3. require 'time'
  4. require 'date'
  5. require 'base64'
  6. require 'lib/metadata/mime_info'
  7. class Pathname
  8. attr_accessor 'mimetype'
  9. def mimetype
  10. @mimetype ||= MimeInfo.get(to_s)
  11. end
  12. def pages
  13. @pages ||= (metadata['Doc.PageCount'] || 1)
  14. end
  15. def dimensions
  16. @dimensions ||= [width, height]
  17. end
  18. def metadata(mime=mimetype, charset=nil, pdf=nil)
  19. @metadata ||= Metadata.extract(self, mime || mimetype, charset, pdf)
  20. end
  21. def length
  22. @length ||= (metadata['Audio.Duration'] || metadata['Video.Duration'] || metadata['Doc.WordCount'].to_i / 250.0)
  23. end
  24. def width
  25. metadata['Image.Width']
  26. end
  27. def height
  28. metadata['Image.Height']
  29. end
  30. def to_pn(*rest)
  31. pn = self
  32. pn = pn.join(*rest) unless rest.empty?
  33. pn
  34. end
  35. end
  36. class String
  37. def to_pn(*rest)
  38. pn = Pathname.new(self)
  39. pn = pn.join(*rest) unless rest.empty?
  40. pn
  41. end
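# Guesses this string's character encoding: asks the external `chardet`
# tool first, then falls back to BOM sniffing and Iconv round-trip tests
# against a list of common encodings.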
  42. def chardet
  43. cset = IO.popen("chardet", "r+"){|cd|
  44. Thread.new {
  45. cd.write(self[0,65536])
  46. cd.close_write
  47. }
  48. # Some chardet versions output '<stdin>: ascii (Confidence: 1.00)',
  49. # so we need to strip off the head and the tail.
  50. cd.read.strip.sub(/^[^:]*:\s*/,'').sub(/\s*\(.*/, '')
  51. }
  52. if cset == 'None'
  53. charsets = ['utf-8',
  54. 'utf-16', 'utf-16be', 'utf-32', 'utf-32be',
  55. 'shift-jis','euc-jp',
  56. 'iso8859-1','cp1252',
  57. 'big-5','gbk','gb18030','gb2312'].compact
  58. pk = $KCODE
  59. $KCODE = 'ascii'
  60. case self
  61. when /\A(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00)/
  62. charsets.unshift 'utf-32'
  63. when /\A(\xFE\xFF|\xFF\xFE)/
  64. charsets.unshift 'utf-16'
  65. when /\A\xEF\xBB\xBF/
  66. charsets.unshift 'utf-8'
  67. when /\A[a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]+\Z/m
  68. charsets.unshift 'ascii' unless self.include?("\000")
  69. end
  70. $KCODE = pk
  71. cset = charsets.find{|c|
  72. ((Iconv.iconv('utf-8', c, self)[0]) rescue false)
  73. }
  74. end
  75. if self.count("\000")*2 >= length and cset == 'ascii'
  76. cset = 'utf-16' + (self.index("\000") % 2 == 0 ? 'le' : 'be')
  77. end
  78. if cset =~ /windows-1255/i and self =~ /[a-z](\344|\366|\326|\304)[a-z]/
  79. cset = 'windows-1252'
  80. end
  81. cset
  82. end
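# Converts this string to UTF-8. Tries the caller-supplied charset first,
# then the chardet guess, then a list of common encodings; strips BOMs and
# null bytes from the result.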
  83. def to_utf8(charset=nil)
  84. us = nil
  85. charsets = [charset, 'utf-8',
  86. 'utf-16', 'utf-16be', 'utf-32', 'utf-32be',
  87. 'shift-jis','euc-jp',
  88. 'iso8859-1','cp1252',
  89. 'big-5','gbk','gb18030','gb2312']
  90. cd = chardet
  91. pk = $KCODE
  92. $KCODE = 'ascii'
  93. if cd
  94. case cd
  95. when /iso-8859|windows-1252/i
  96. na_re = /[^a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]/
  97. nl = gsub(na_re,'').length
  98. if length > 1.5 * nl
  99. charsets.insert(8, cd) # low ascii content
  100. else
  101. charsets.insert(2, cd) # high ascii content
  102. end
  103. when /utf/i
  104. charsets.insert(1, cd)
  105. else
  106. charsets.insert(2, cd)
  107. end
  108. end
  109. charsets.compact!
  110. case self
  111. when /\A(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00)/
  112. charsets.unshift 'utf-32'
  113. bom = true
  114. when /\A(\xFE\xFF|\xFF\xFE)/
  115. charsets.unshift 'utf-16'
  116. bom = true
  117. when /\A\xEF\xBB\xBF/
  118. charsets.unshift 'utf-8'
  119. bom = true
  120. when /\A[a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]+\Z/m
  121. charsets.unshift 'ascii' unless self.include?("\000")
  122. when /\301\265|\220\333/
  123. charsets.unshift 'gbk'
  124. end
  125. $KCODE = pk
  126. cset = charsets.find{|c|
  127. ((us = Iconv.iconv('utf-8', c, self)[0]) rescue false)
  128. }
  129. if not bom
  130. if cset =~ /^utf-(16|32)(le|$)/i
  131. na_re = /[^a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]/
  132. if us.length > 1.9 * us.gsub(na_re,'').length
  133. rcset = cset.sub(/le|$/){|m| m == 'be' ? 'le' : 'be' }
  134. nus = ((Iconv.iconv('utf-8', rcset, self)[0]) rescue false)
  135. if nus and (nus.gsub(na_re,'').length > us.gsub(na_re,'').length)
  136. us = nus
  137. end
  138. end
  139. end
  140. end
  141. us ||= self.gsub(/[^0-9a-z._ '"\*\+\-]/,'?')
  142. us.sub!(/\A(\x00\x00\xFE\xFF|(\xFF\xFE(\x00\x00)?)|\xEF\xBB\xBF|\xFE\xFF)/, '') # strip UTF BOMs
  143. us.tr!("\0", "") # strip null bytes
  144. us
  145. end
  146. end
  147. class Array
  148. def to_hash
  149. h = {}
  150. each{|k,v| h[k] = v}
  151. h
  152. end
  153. end
  154. class Numeric
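# Point/millimetre conversions used for PDF page sizes (1 pt = 25.4/72 mm, roughly 0.3528 mm).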
  155. def points_to_mm
  156. self * 0.3528
  157. end
  158. def mm_to_points
  159. self / 0.3528
  160. end
  161. end
  162. module Metadata
  163. extend self
  164. attr_accessor(:quiet, :verbose,
  165. :sha1sum, :md5sum,
  166. :no_text, :guess_title, :guess_metadata, :guess_pubdata,
  167. :use_citeseer, :use_dblp,
  168. :include_name, :include_path)
  169. # Extracts metadata from a file by guessing mimetype and calling matching
  170. # extractor methods (which mostly call external programs to do their bidding.)
  171. #
  172. # Metadata.extract('foo.png')
  173. #
  174. # Follows the Shared File Metadata Spec naming.
  175. # http://wiki.freedesktop.org/wiki/Specifications/shared-filemetadata-spec
  176. #
  177. # There are a couple flags that control the behaviour of
  178. # the metadata extractor:
  179. #
  180. # Metadata.sha1sum = true # include File.SHA1Sum in the metadata
  181. # Metadata.md5sum = true # include File.MD5Sum in the metadata
  182. # Metadata.include_name = true # include File.Name (file basename)
  183. # Metadata.include_path = true # include File.Path (file dirname)
  184. # Metadata.quiet = true # override verbose to false
  185. # Metadata.verbose = true # print out status messages to stderr
  186. #
  187. # All strings are converted to UTF-8.
  188. #
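# A minimal usage sketch mirroring the flags documented above
# ('report.pdf' is a hypothetical input file):
#
#   Metadata.include_name = true
#   Metadata.sha1sum = true
#   md = Metadata.extract('report.pdf')
#   md['File.Format']  # => e.g. "application/pdf"
#   md['Doc.Title']    # => the title, if one could be extracted
#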
  189. def extract(filename, mimetype=MimeInfo.get(filename.to_s), charset=nil, pdf=nil)
  190. verbose = self.verbose && !quiet
  191. filename = filename.to_s
  192. mimetype = Mimetype[mimetype] unless mimetype.is_a?( Mimetype )
  193. unless File.exist?(filename)
  194. rv = {}
  195. if self.include_name
  196. rv['File.Name'] = enc_utf8(File.basename(filename), nil)
  197. end
  198. if self.include_path
  199. rv['File.Path'] = enc_utf8(File.dirname(filename), nil)
  200. end
  201. rv['File.Format'] ||= mimetype.to_s
  202. return rv
  203. end
  204. mts = mimetype.ancestors
  205. mt = mts.shift
  206. rv = nil
  207. new_methods = public_methods(false)
  208. #verbose message
  209. # TODO: send in json to browser
  210. STDERR.puts "Processing #{filename}", " Metadata extraction" if verbose
  211. while mt.is_a?(Mimetype) and mt != Mimetype
  212. #verbose message
  213. # TODO: send in json to browser
  214. STDERR.puts " Trying #{mt}" if verbose
  215. mn = mt.to_s.gsub(/[^a-z0-9]/i,"_")
  216. if new_methods.include?( mn )
  217. begin
  218. rv = __send__( mn, filename, charset )
  219. #verbose message
  220. # TODO: send in json to browser
  221. STDERR.puts " OK" if verbose
  222. break
  223. rescue => e
  224. #verbose message
  225. # TODO: send in json to browser
  226. STDERR.puts(e, e.message, e.backtrace) if verbose
  227. end
  228. end
  229. mt = mts.shift
  230. end
  231. unless rv
  232. #verbose message
  233. # TODO: send in json to browser
  234. STDERR.puts " Falling back to extract" if verbose
  235. rv = extract_extract_info(filename)
  236. end
  237. if self.include_name
  238. rv['File.Name'] = enc_utf8(File.basename(filename), nil)
  239. end
  240. if self.include_path
  241. rv['File.Path'] = enc_utf8(File.dirname(filename), nil)
  242. end
  243. rv['File.Format'] ||= mimetype.to_s
  244. if File.file?(filename)
  245. if self.sha1sum
  246. secure_filename(filename){|sfn|
  247. rv['File.SHA1Sum'] = `sha1sum #{sfn}`.split(" ",2)[0]
  248. }
  249. end
  250. if self.md5sum
  251. secure_filename(filename){|sfn|
  252. rv['File.MD5Sum'] = `md5sum #{sfn}`.split(" ",2)[0]
  253. }
  254. end
  255. end
  256. rv['File.Size'] = (
  257. if File.directory?(filename)
  258. Dir.entries(filename).size-2
  259. else
  260. File.size(filename)
  261. end)
  262. rv['File.Content'] = extract_text(filename, mimetype, charset, false) unless Metadata.no_text
  263. pdf ||= filename + "-temp.pdf"
  264. if File.exist?(pdf)
  265. pdf_metadata = application_pdf(pdf, charset)
  266. overrides = %w(Image.DimensionUnit Image.Width Image.Height Doc.PageCount
  267. Doc.PageSizeName)
  268. optrides = %w(Doc.WordCount Doc.Title Doc.Author)
  269. overrides.each{|o| rv[o] = pdf_metadata[o] }
  270. optrides.each {|o| rv[o] ||= pdf_metadata[o] }
  271. if !Metadata.no_text and not mimetype.to_s =~ /postscript/
  272. rv['File.Content'] = extract_text(pdf, Mimetype['application/pdf'], charset, false)
  273. end
  274. end
  275. if guess_title or guess_metadata or guess_pubdata
  276. gem_require 'metadata/title_guesser'
  277. gem_require 'metadata/publication_guesser'
  278. gem_require 'metadata/reference_guesser'
  279. text = (rv['File.Content'] || extract_text(filename, mimetype, charset, false))
  280. guess = extract_guesses(text)
  281. if guess['Doc.Title'] and (rv['Doc.Title'].nil? or rv['Doc.Title'] =~ /(^[a-z])|((\.(dvi|doc)|WORD)$)|^Slide 1$|^PowerPoint Presentation$/)
  282. rv['Doc.Title'] = guess['Doc.Title']
  283. end
  284. end
  285. if use_citeseer and rv['Doc.Title'] and mimetype.to_s =~ /pdf|postscript|dvi|tex/
  286. rv.merge!(citeseer_extract(rv['Doc.Title']))
  287. end
  288. if use_dblp and rv['Doc.Title'] and mimetype.to_s =~ /pdf|postscript|dvi|tex/
  289. rv.merge!(dblp_extract(rv['Doc.Title']))
  290. end
  291. if guess_metadata or guess_pubdata
  292. %w(Doc.Publisher Doc.Published Doc.Publication Doc.Genre Event.Name Event.Organizer
  293. ).each{|field|
  294. rv[field] ||= guess[field] if guess[field]
  295. }
  296. end
  297. if guess_metadata
  298. %w(Doc.Citations Doc.Description Doc.ACMCategories Doc.Keywords
  299. ).each{|field|
  300. rv[field] ||= guess[field] if guess[field]
  301. }
  302. end
  303. rv['File.Modified'] = parse_time(File.mtime(filename.to_s).iso8601)
  304. rv.delete_if{|k,v| v.nil? }
  305. rv
  306. end
  307. # Extracts text from a file by guessing mimetype and calling matching
  308. # extractor methods (which mostly call external programs to do their bidding.)
  309. #
  310. # The extracted text is converted to UTF-8.
  311. #
  312. def extract_text(filename, mimetype=MimeInfo.get(filename.to_s), charset=nil, layout=false)
  313. filename = filename.to_s
  314. mimetype = Mimetype[mimetype] unless mimetype.is_a?( Mimetype )
  315. mts = mimetype.ancestors
  316. mt = mts.shift
  317. new_methods = public_methods(false)
  318. #verbose message
  319. # TODO: send in json to browser
  320. STDERR.puts " Text extraction" if verbose
  321. while mt.is_a?(Mimetype) and mt != Mimetype
  322. #verbose message
  323. # TODO: send in json to browser
  324. STDERR.puts " Trying #{mt}" if verbose
  325. mn = mt.to_s.gsub(/[^a-z0-9]/i,"_") + "__gettext"
  326. if new_methods.include?( mn )
  327. begin
  328. rv = __send__( mn, filename, charset, layout )
  329. #verbose message
  330. # TODO: send in json to browser
  331. STDERR.puts " OK" if verbose
  332. return rv
  333. rescue => e
  334. #verbose message
  335. # TODO: send in json to browser
  336. STDERR.puts(e, e.message, e.backtrace) unless quiet
  337. end
  338. end
  339. mt = mts.shift
  340. end
  341. #verbose message
  342. # TODO: send in json to browser
  343. STDERR.puts " Text extraction failed" if verbose
  344. nil
  345. end
  346. alias_method :[], 'extract'
  347. def gem_require(libname)
  348. retried = false
  349. begin
  350. require libname
  351. rescue LoadError
  352. unless retried
  353. #verbose message
  354. # TODO: send in json to browser
  355. STDERR.puts "Requiring rubygems" if verbose
  356. require 'rubygems'
  357. retried = true
  358. retry
  359. else
  360. raise
  361. end
  362. end
  363. end
  364. def extract_guesses(text)
  365. return {} unless text
  366. guess = {}
  367. title = TitleGuesser.guess_title(text)
  368. pubdata = PublicationGuesser.guess_pubdata(text)
  369. str = remove_ligatures(text).split(/\f+/m)[0,2].join("\n")
  370. abstract = str.scan(
  371. /\babstract\s*\n(.+)\n\s*((\d+\.)|(\d\.?)*\s*(keywords|categories|introduction|(\d\.?)\s*[a-z]+))\s*\n/im
  372. ).flatten.first
  373. if abstract
  374. abstract.gsub!(/\A(\s*[a-z]+@([a-z]+\.)+[a-z]+\s*)+/im, '')
  375. if abstract.size > 500
  376. abstract = abstract.split(/(?=\n)/).inject(""){|s,i|
  377. s << i unless s.size > 500
  378. s
  379. }
  380. end
  381. end
  382. kw_re = /\bkeywords:?\b/i
  383. cat_re = /\bcategories:?\b/i
  384. acm_cat_re = /\b([A-K]\.(\d(\.\d)?)?)\b/
  385. kw_list_re = /(([^\.]+,)+[^\.\n]+)/m
  386. if str =~ cat_re
  387. cats = str.split(cat_re,2).last.
  388. scan(acm_cat_re).
  389. map{|hit| hit[0] }
  390. end
  391. if str =~ kw_re
  392. kws = str.split(kw_re)[1..-1].map{|kw|
  393. kw.scan(kw_list_re).flatten.first
  394. }.compact.
  395. map{|hit| hit.split(/\s*,\s*/).map{|s|s.strip} }.
  396. max{|a,b| a.length <=> b.length }
  397. end
  398. # cites = ReferenceGuesser.guess_references(text)
  399. guess['Doc.Title'] = title.strip.to_utf8 if title and title.strip.size < 100
  400. guess['Doc.Description'] = abstract.strip.to_utf8 if abstract
  401. # guess['Doc.Citations'] = cites if cites and not cites.empty?
  402. guess['Doc.Keywords'] = kws if kws and not kws.empty?
  403. if cats and not cats.empty?
  404. require 'metadata/acm_categories'
  405. guess['Doc.ACMCategories'] = cats.map{|cat|
  406. "#{cat.upcase} #{ACM_CATEGORIES[cat.upcase]}"
  407. }
  408. end
  409. guess = guess.merge(pubdata)
  410. guess
  411. end
  412. #TODO: Generate audio waveform
  413. def audio_x_flac(fn, charset)
  414. gem_require 'flacinfo'
  415. m = nil
  416. begin
  417. m = FlacInfo.new(fn)
  418. rescue # FlacInfo fails for flacs with id3 tags
  419. return audio(fn, charset)
  420. end
  421. t = m.tags
  422. si = m.streaminfo
  423. len = si["total_samples"].to_f / si["samplerate"]
  424. md = {
  425. 'Audio.Codec' => 'FLAC',
  426. 'Audio.Title' => enc_utf8(t['TITLE'], charset),
  427. 'Audio.Artist' => enc_utf8(t['ARTIST'], charset),
  428. 'Audio.Album' => enc_utf8(t['ALBUM'], charset),
  429. 'Audio.Comment' => enc_utf8(t['COMMENT'], charset),
  430. 'Audio.Bitrate' => File.size(fn)*8 / len,
  431. 'Audio.Duration' => len,
  432. 'Audio.Samplerate' => si["samplerate"],
  433. 'Audio.VariableBitrate' => true,
  434. 'Audio.Genre' => parse_genre(enc_utf8(t['GENRE'], charset)),
  435. 'Audio.ReleaseDate' => parse_time(t['DATE']),
  436. 'Audio.TrackNo' => parse_num(t['TRACKNUMBER'], :i),
  437. 'Audio.Channels' => si["channels"]
  438. }
  439. ad = (audio(fn, charset) rescue {})
  440. ad.delete_if{|k,v| v.nil? }
  441. md.merge(ad)
  442. end
  443. #TODO: Generate audio waveform
  444. def audio_mp4(fn, charset)
  445. gem_require 'mp4info'
  446. m = MP4Info.open(fn)
  447. tn, total = m.TRKN
  448. md = {
  449. 'Audio.Title' => enc_utf8(m.NAM, charset),
  450. 'Audio.Artist' => enc_utf8(m.ART, charset),
  451. 'Audio.Album' => enc_utf8(m.ALB, charset),
  452. 'Audio.Bitrate' => m.BITRATE,
  453. 'Audio.Duration' => m.SECS,
  454. 'Audio.Samplerate' => m.FREQUENCY*1000,
  455. 'Audio.VariableBitrate' => true,
  456. 'Audio.Genre' => parse_genre(enc_utf8(m.GNRE, charset)),
  457. 'Audio.ReleaseDate' => parse_time(m.DAY),
  458. 'Audio.TrackNo' => parse_num(tn, :i),
  459. 'Audio.AlbumTrackCount' => parse_num(total, :i),
  460. 'Audio.Writer' => enc_utf8(m.WRT, charset),
  461. 'Audio.Copyright' => enc_utf8(m.CPRT, charset),
  462. 'Audio.Tempo' => parse_num(m.TMPO, :i),
  463. 'Audio.Codec' => enc_utf8(m.ENCODING, charset),
  464. 'Audio.AppleID' => enc_utf8(m.APID, charset),
  465. 'Audio.Image' => base64(m.COVR),
  466. }
  467. end
  468. #TODO: Generate audio waveform
  469. def audio_x_ms_wma(fn, charset)
  470. gem_require 'wmainfo'
  471. # hack hack hacky workaround
  472. m = WmaInfo.allocate
  473. m.instance_variable_set("@ext_info", {})
  474. m.__send__(:initialize, fn)
  475. t = m.tags
  476. si = m.info
  477. md = {
  478. 'Audio.Codec' => 'Windows Media Audio',
  479. 'Audio.Title' => enc_utf8(t['Title'], charset),
  480. 'Audio.Artist' => enc_utf8(t['Author'], charset),
  481. 'Audio.Album' => enc_utf8(t['AlbumTitle'], charset),
  482. 'Audio.AlbumArtist' => enc_utf8(t['AlbumArtist'], charset),
  483. 'Audio.Bitrate' => si["bitrate"],
  484. 'Audio.Duration' => si["playtime_seconds"],
  485. 'Audio.Genre' => parse_genre(enc_utf8(t['Genre'], charset)),
  486. 'Audio.ReleaseDate' => parse_time(t['Year']),
  487. 'Audio.TrackNo' => parse_num(t['TrackNumber'], :i),
  488. 'Audio.Copyright' => enc_utf8(t['Copyright'], charset),
  489. 'Audio.VariableBitrate' => (si['IsVBR'] == 1)
  490. }
  491. end
  492. #TODO: Generate audio waveform
  493. def audio_x_ape(fn, charset)
  494. gem_require 'apetag'
  495. m = ApeTag.new(fn)
  496. t = m.fields
  497. ad = (id3lib_extract(fn, charset) rescue {})
  498. fields = %w(Title Artist Album Comment Genre Subtitle Publisher Conductor
  499. Composer Copyright Publicationright File EAN/UPC ISBN Catalog
  500. LC Media Index Related ISRC Abstract Language Bibliography
  501. Introplay Dummy) + ['Debut Album', 'Record Date', 'Record Location']
  502. md = {
  503. 'Audio.ReleaseDate' => parse_time(t['Year']),
  504. 'Audio.TrackNo' => parse_num(t['Track'], :i)
  505. }
  506. fields.each{|k| md["Audio.#{k.gsub(" ", "")}"] = t[k] }
  507. ad.delete_if{|k,v| v.nil? }
  508. md['Audio.Genre'] = parse_genre(md['Audio.Genre'])
  509. md.merge(ad)
  510. end
  511. alias_method :audio_x_musepack, :audio_x_ape
  512. alias_method :audio_x_wavepack, :audio_x_ape
  513. #TODO: Generate audio waveform
  514. def audio_mpeg(fn, charset)
  515. gem_require 'mp3info'
  516. h = audio(fn, charset)
  517. begin
  518. Mp3Info.open(fn){|mp3|
  519. h['Audio.Duration'] = mp3.length
  520. h['Audio.Bitrate'] = mp3.bitrate
  521. h['Audio.VariableBitrate'] = mp3.vbr
  522. }
  523. rescue => e
  524. end
  525. h
  526. end
  527. def application_pdf(filename, charset)
  528. h = pdfinfo_extract_info(filename)
  529. charset = nil
  530. secure_filename(filename){|tfn|
  531. charset = `pdftotext #{tfn} - | head -c 65536`.chardet
  532. h['words'] = `pdftotext #{tfn} - | wc -w 2>/dev/null`.strip.to_i
  533. }
  534. if h['keywords']
  535. keywords = h['keywords'].split(/[,.]/).map{|s| enc_utf8(s.strip, charset) }.find_all{|s| not s.empty? }
  536. end
  537. md = {
  538. 'Doc.Title', enc_utf8(h['title'], charset),
  539. 'Doc.Author', enc_utf8(h['author'], charset),
  540. 'Doc.Created', parse_time(h['creationdate']),
  541. 'Doc.Subject', enc_utf8(h['subject'], charset),
  542. 'Doc.Modified', parse_time(h['moddate']),
  543. 'Doc.PageCount', h['pages'],
  544. 'Doc.Keywords', keywords,
  545. 'Doc.PageSizeName', h['page_size'],
  546. 'Doc.WordCount', h['words'],
  547. 'Doc.Charset', charset,
  548. 'Image.Width', parse_num(h['width'], :f),
  549. 'Image.Height', parse_num(h['height'], :f),
  550. 'Image.DimensionUnit', 'mm'
  551. }
  552. md.delete_if{|k,v| v.nil? }
  553. md
  554. end
  555. def application_postscript(filename, charset)
  556. extract_extract_info(filename)
  557. end
  558. alias_method :application_x_gzpostscript, :application_postscript
  559. def text_html(filename, charset)
  560. gem_require 'hpricot'
  561. words = secure_filename(filename){|tfn|
  562. `lynx -dump -display_charset=UTF-8 -nolist #{tfn} | wc -w 2>/dev/null`
  563. }.strip.to_i
  564. html = (File.read(filename, 65536) || "")
  565. charset = html.chardet
  566. h = {
  567. 'Doc.WordCount' => words,
  568. 'Doc.Charset' => charset
  569. }
  570. begin
  571. page = Hpricot.parse(html)
  572. te = (page / 'title')[0]
  573. if te
  574. title = enc_utf8(te.inner_text, charset)
  575. h['Doc.Title'] = title
  576. end
  577. tagstr = __get_meta(page, 'keywords', charset)
  578. if tagstr
  579. h['Doc.Keywords'] = tagstr.split(/\s*,\s*/)
  580. end
  581. h['Doc.Description'] = __get_meta(page, 'description', charset)
  582. h['Doc.Author'] = (__get_meta(page, 'author', charset) ||
  583. __get_meta(page, 'dc.author', charset))
  584. h['Doc.Publisher'] = (__get_meta(page, 'publisher', charset) ||
  585. __get_meta(page, 'dc.publisher', charset))
  586. h['Doc.Subject'] = (__get_meta(page, 'subject', charset) ||
  587. __get_meta(page, 'dc.subject', charset))
  588. geopos = __get_meta(page, 'geo.position', charset)
  589. icbm = __get_meta(page, 'icbm', charset)
  590. if geopos
  591. latlon = geopos.strip.split(/\s*;\s*/).map{|n| n.to_f }
  592. elsif icbm
  593. latlon = icbm.strip.split(/\s*,\s*/).map{|n| n.to_f }
  594. end
  595. if latlon and latlon.size == 2
  596. h['Location.Latitude'] = latlon[0]
  597. h['Location.Longitude'] = latlon[1]
  598. end
  599. rescue
  600. end
  601. h
  602. end
  603. def __get_meta(page, name, charset=nil)
  604. tag = (page / 'meta').find{|e|
  605. e['name'].to_s.downcase == name.downcase }
  606. return enc_utf8(tag['content'].to_s, charset) if tag
  607. nil
  608. end
  609. def text(filename, charset)
  610. words = secure_filename(filename){|tfn| `wc -w #{tfn} 2>/dev/null` }.strip.to_i
  611. charset = (File.read(filename, 65536) || "").chardet
  612. {
  613. 'Doc.WordCount' => words,
  614. 'Doc.Charset' => charset
  615. }
  616. end
  617. #TODO: Generate audio waveform
  618. def audio(filename, charset)
  619. id3 = (id3lib_extract(filename, charset) rescue {})
  620. h = mplayer_extract_info(filename)
  621. info = {
  622. 'Audio.Duration', (h['length'].to_i > 0) ? parse_num(h['length'], :f) : nil,
  623. 'Audio.Bitrate', h['audio_bitrate'] && h['audio_bitrate'] != '0' ?
  624. parse_num(h['audio_bitrate'], :i) / 1000.0 : nil,
  625. 'Audio.Codec', enc_utf8(h['audio_format'], charset),
  626. 'Audio.Samplerate', parse_num(h['audio_rate'], :i),
  627. 'Audio.Channels', parse_num(h['audio_nch'], :i),
  628. 'Audio.Title', enc_utf8(h['title'] || h['name'], charset),
  629. 'Audio.Artist', enc_utf8(h['artist'] || h['author'], charset),
  630. 'Audio.Album', enc_utf8(h['album'], charset),
  631. 'Audio.ReleaseDate', parse_time(h['date'] || h['creation date'] || h['year']),
  632. 'Audio.Comment', enc_utf8(h['comment'] || h['comments'], charset),
  633. 'Audio.TrackNo', parse_num(h['track'], :i),
  634. 'Audio.Copyright', enc_utf8(h['copyright'], charset),
  635. 'Audio.Software', enc_utf8(h['software'], charset),
  636. 'Audio.Genre', parse_genre(enc_utf8(h['genre'], charset))
  637. }
  638. id3.delete_if{|k,v| v.nil? }
  639. info.merge(id3)
  640. end
  641. #TODO: Generate audio waveform
  642. #TODO: Generate few frames for preview
  643. def video(filename, charset)
  644. id3 = (id3lib_extract(filename, charset) rescue {})
  645. h = mplayer_extract_info(filename)
  646. info = {
  647. 'Image.Width', parse_num(h['video_width'], :f),
  648. 'Image.Height', parse_num(h['video_height'], :f),
  649. 'Image.DimensionUnit', 'px',
  650. 'Video.Duration', (h['length'].to_i > 0) ? parse_num(h['length'], :f) : nil,
  651. 'Video.Framerate', parse_num(h['video_fps'], :f),
  652. 'Video.Bitrate', h['video_bitrate'] && h['video_bitrate'] != '0' ?
  653. parse_num(h['video_bitrate'], :i) / 1000.0 : nil,
  654. 'Video.Codec', enc_utf8(h['video_format'], charset),
  655. 'Audio.Bitrate', h['audio_bitrate'] && h['audio_bitrate'] != '0' ?
  656. parse_num(h['audio_bitrate'], :i) / 1000.0 : nil,
  657. 'Audio.Codec', enc_utf8(h['audio_format'], charset),
  658. 'Audio.Samplerate', parse_num(h['audio_rate'], :i),
  659. 'Audio.Channels', parse_num(h['audio_nch'], :i),
  660. 'Video.Title', enc_utf8(h['title'] || h['name'], charset),
  661. 'Video.Artist', enc_utf8(h['artist'] || h['author'], charset),
  662. 'Video.Album', enc_utf8(h['album'], charset),
  663. 'Video.ReleaseDate', parse_time(h['date'] || h['creation date'] || h['year']),
  664. 'Video.Comment', enc_utf8(h['comment'] || h['comments'], charset),
  665. 'Video.TrackNo', parse_num(h['track'], :i),
  666. 'Video.Genre', parse_genre(enc_utf8(h['genre'], charset)),
  667. 'Video.Copyright', enc_utf8(h['copyright'], charset),
  668. 'Video.Software', enc_utf8(h['software'], charset),
  669. 'Video.Demuxer', enc_utf8(h['demuxer'], charset)
  670. }
  671. case h['demuxer']
  672. when 'avi'
  673. info['File.Format'] = 'video/x-msvideo'
  674. when 'mkv'
  675. info['File.Format'] = 'video/x-matroska'
  676. when 'mov'
  677. info['File.Format'] = 'video/quicktime'
  678. end
  679. id3.delete_if{|k,v| v.nil? }
  680. info.merge(id3)
  681. end
  682. alias_method('application_x_flash_video', 'video')
  683. def video_x_ms_wmv(filename, charset)
  684. h = video(filename, charset)
  685. wma = audio_x_ms_wma(filename, charset)
  686. %w(
  687. Bitrate Artist Title Album Genre ReleaseDate TrackNo VariableBitrate
  688. ).each{|t|
  689. h['Video.'+t] = wma['Audio.'+t]
  690. }
  691. %w(Samplerate Codec).each{|t|
  692. h['Audio.'+t] = wma['Audio.'+t]
  693. }
  694. h
  695. end
  696. alias_method('video_x_ms_asf', 'video_x_ms_wmv')
  697. #TODO: Generate image histogram
  698. def image(filename, charset)
  699. begin
  700. gem_require 'imlib2'
  701. img = Imlib2::Image.load(filename.to_s)
  702. w = img.width
  703. h = img.height
  704. id_out = ""
  705. img.delete!
  706. rescue Exception
  707. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  708. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  709. end
  710. exif = (extract_exif(filename, charset) rescue {})
  711. info = {
  712. 'Image.Width' => parse_num(w, :f),
  713. 'Image.Height' => parse_num(h, :f),
  714. 'Image.DimensionUnit' => 'px',
  715. 'Image.LayerCount' => [id_out.split("\n").size, 1].max
  716. }.merge(exif)
  717. info
  718. end
  719. def image_svg_xml(filename, charset)
  720. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  721. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  722. info = {
  723. 'Image.Width' => parse_num(w, :f),
  724. 'Image.Height' => parse_num(h, :f),
  725. 'Image.DimensionUnit' => 'px'
  726. }
  727. info
  728. end
  729. def image_gif(filename, charset)
  730. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  731. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  732. exif = (extract_exif(filename, charset) rescue {})
  733. info = {
  734. 'Image.Width' => parse_num(w, :f),
  735. 'Image.Height' => parse_num(h, :f),
  736. 'Image.DimensionUnit' => 'px',
  737. 'Image.FrameCount' => [id_out.split("\n").size, 1].max
  738. }.merge(exif)
  739. info
  740. end
  741. def image_x_dcraw(filename, charset)
  742. exif = (extract_exif(filename, charset) rescue {})
  743. dcraw = extract_dcraw(filename)
  744. info = {
  745. 'Image.Frames' => 1,
  746. 'Image.DimensionUnit' => 'px'
  747. }.merge(exif).merge(dcraw)
  748. info
  749. end
  750. def application_x_bittorrent(fn, charset)
  751. require 'metadata/bt'
  752. h = File.read(fn).bdecode
  753. charset ||= h['encoding']
  754. i = h['info']
  755. name = i['name.utf-8'] || enc_utf8(i['name'], charset)
  756. {
  757. 'Doc.Title' => name,
  758. 'BitTorrent.Name' => name,
  759. 'BitTorrent.Files' =>
  760. if i['files']
  761. i['files'].map{|f|
  762. up = f['path.utf-8']
  763. up = up.join("/") if up.is_a?(Array)
  764. pt = f['path']
  765. pt = pt.join("/") if pt.is_a?(Array)
  766. fh = {"path" => (up || enc_utf8(pt, charset)),
  767. "length" => f['length']
  768. }
  769. fh['md5sum'] = f['md5sum'] if f['md5sum']
  770. fh
  771. }
  772. else
  773. nil
  774. end,
  775. 'BitTorrent.Length' => i['length'],
  776. 'BitTorrent.MD5Sum' => i['md5sum'],
  777. 'BitTorrent.PieceLength' => i['piece length'],
  778. 'BitTorrent.PieceCount' => i['pieces'].size / 20,
  779. 'File.Software' => enc_utf8(h['created by'], charset),
  780. 'Doc.Created' => parse_time(Time.at(h['creation date']).iso8601),
  781. 'BitTorrent.Comment' => enc_utf8(h['comment'], charset),
  782. 'BitTorrent.Announce' => enc_utf8(h['announce'], charset),
  783. 'BitTorrent.AnnounceList' => h['announce-list'],
  784. 'BitTorrent.Nodes' => h['nodes']
  785. }
  786. end
  787. def text__gettext(filename, charset, layout=false)
  788. enc_utf8((File.read(filename) || ""), charset)
  789. end
  790. def text_html__gettext(filename, charset, layout=false)
  791. enc_utf8(secure_filename(filename){|tfn| `lynx -dump -display_charset=UTF-8 -nolist #{tfn}` }, charset)
  792. end
  793. def application_pdf__gettext(filename, charset, layout=false)
  794. page = 0
  795. str = secure_filename(filename){|tfn| `pdftotext #{layout ? "-layout " : ""}-enc UTF-8 #{tfn} -` }
  796. if layout
  797. str.gsub!(/\f/u, "\f\n")
  798. str.gsub!(/^/u, " ")
  799. str.gsub!(/\A| ?\f/u) {|pg|
  800. "\nPage #{page+=1}.\n"
  801. }
  802. str.sub!(/\n+/, "")
  803. str.sub!(/1\./, "1.\n")
  804. end
  805. enc_utf8(str, "UTF-8")
  806. end
  807. def application_postscript__gettext(filename, charset, layout=false)
  808. page = 0
  809. str = secure_filename(filename){|tfn| `pstotext #{tfn}` }
  810. if layout
  811. str.gsub!(/\f/u, "\f\n")
  812. str.gsub!(/^/u, " ")
  813. str.gsub!(/\A| ?\f/u) {|pg|
  814. "\nPage #{page+=1}.\n"
  815. }
  816. str.sub!(/\n+/, "")
  817. str.sub!(/1\./, "1.\n")
  818. end
  819. enc_utf8(str, "ISO-8859-1") # pstotext outputs iso-8859-1
  820. end
  821. def application_x_gzpostscript__gettext(filename, charset, layout=false)
  822. page = 0
  823. str = secure_filename(filename){|tfn| `zcat #{tfn} | pstotext -` }
  824. if layout
  825. str.gsub!(/\f/u, "\f\n")
  826. str.gsub!(/^/u, " ")
  827. str.gsub!(/\A| ?\f/u) {|pg|
  828. "\nPage #{page+=1}.\n"
  829. }
  830. str.sub!(/\n+/, "")
  831. str.sub!(/1\./, "1.\n")
  832. end
  833. enc_utf8(str, "ISO-8859-1") # pstotext outputs iso-8859-1
  834. end
  835. def application_msword__gettext(filename, charset, layout=false)
  836. secure_filename(filename){|sfn| enc_utf8(`antiword #{sfn}`, charset) }
  837. end
  838. def application_rtf__gettext(filename, charset, layout=false)
  839. secure_filename(filename){|sfn| enc_utf8(`catdoc -d UTF-8 #{sfn}`, charset) }
  840. end
  841. def application_vnd_ms_powerpoint__gettext(filename, charset, layout=false)
  842. secure_filename(filename){|sfn| enc_utf8(`catppt -d UTF-8 #{sfn}`, charset) }
  843. end
  844. def application_vnd_ms_excel__gettext(filename, charset, layout=false)
  845. secure_filename(filename){|sfn| enc_utf8(`xls2csv -d UTF-8 #{sfn}`, charset) }
  846. end
  847. open_office_types = %w(
  848. application/vnd.oasis.opendocument.text
  849. application/vnd.oasis.opendocument.text-template
  850. application/vnd.oasis.opendocument.text-web
  851. application/vnd.oasis.opendocument.text-master
  852. application/vnd.oasis.opendocument.graphics
  853. application/vnd.oasis.opendocument.graphics-template
  854. application/vnd.oasis.opendocument.presentation
  855. application/vnd.oasis.opendocument.presentation-template
  856. application/vnd.oasis.opendocument.spreadsheet
  857. application/vnd.oasis.opendocument.spreadsheet-template
  858. application/vnd.oasis.opendocument.presentation
  859. application/vnd.oasis.opendocument.chart
  860. application/vnd.oasis.opendocument.formula
  861. application/vnd.oasis.opendocument.database
  862. application/vnd.sun.xml.writer
  863. application/vnd.sun.xml.writer.template
  864. application/vnd.sun.xml.calc
  865. application/vnd.sun.xml.calc.template
  866. application/vnd.sun.xml.impress
  867. application/vnd.sun.xml.impress.template
  868. application/vnd.sun.xml.writer.global
  869. application/vnd.sun.xml.math
  870. application/vnd.stardivision.writer
  871. application/vnd.stardivision.writer-global
  872. application/vnd.stardivision.calc
  873. application/vnd.stardivision.impress
  874. application/vnd.stardivision.impress-packed
  875. application/vnd.stardivision.math
  876. application/vnd.stardivision.chart
  877. application/vnd.stardivision.mail
  878. application/x-starwriter
  879. application/x-starcalc
  880. application/x-stardraw
  881. application/x-starimpress
  882. application/x-starmath
  883. application/x-starchart)
  884. office_types = %w(
  885. application/msword
  886. application/rtf
  887. application/vnd.openxmlformats-officedocument.presentationml.presentation
  888. application/vnd.openxmlformats-officedocument.wordprocessingml.document
  889. application/vnd.ms-word.document.macroenabled.12
  890. application/vnd.openxmlformats-officedocument.wordprocessingml.template
  891. application/vnd.ms-word.template.macroenabled.12
  892. application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
  893. application/vnd.ms-excel.sheet.macroenabled.12
  894. application/vnd.openxmlformats-officedocument.spreadsheetml.template
  895. application/vnd.ms-excel.template.macroenabled.12
  896. application/vnd.openxmlformats-officedocument.presentationml.presentation
  897. application/vnd.ms-powerpoint.presentation.macroenabled.12
  898. application/vnd.openxmlformats-officedocument.presentationml.template
  899. application/vnd.ms-powerpoint.template.macroenabled.12
  900. application/vnd.ms-excel.sheet.binary.macroenabled.12
  901. application/vnd.ms-word
  902. application/vnd.ms-excel
  903. application/vnd.ms-powerpoint
  904. )
  905. def self.create_text_extractor(mimetype, &block)
  906. major,minor = mimetype.to_s.gsub(/[^\/a-z0-9]/i,"_").split("/")
  907. mn = [major,minor,"_gettext"].join("_")
  908. define_method(mn, &block)
  909. end
  910. def self.create_info_extractor(mimetype, &block)
  911. major,minor = mimetype.to_s.gsub(/[^\/a-z0-9]/i,"_").split("/")
  912. mn = [major,minor].join("_")
  913. define_method(mn, &block)
  914. end
  915. (open_office_types).each{|t|
  916. create_text_extractor(t) do |filename, charset, layout|
  917. nil
  918. end
  919. }
  920. (open_office_types + office_types).each{|t|
  921. create_info_extractor(t) do |filename, charset|
  922. extract_extract_info(filename)
  923. end
  924. }
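# Runs mplayer in identify-only mode (null output, zero frames) and turns its
# ID_* lines and CLIP INFO entries into a hash of lowercased keys.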
  925. def mplayer_extract_info(filename)
  926. mplayer = `which mplayer32 2>/dev/null`.strip
  927. mplayer = `which mplayer 2>/dev/null`.strip if mplayer.empty?
  928. mplayer = "mplayer" if mplayer.empty?
  929. output = IO.popen("#{mplayer.dump} -quiet -identify -vo null -ao null -frames 0 -playlist - 2>/dev/null", "r+"){|mp|
  930. mp.puts filename
  931. mp.close_write
  932. mp.read
  933. }
  934. ids = output.split("\n").grep(/^ID_/).map{|t|
  935. k,v, = t.split("=",2)
  936. k = k.downcase[3..-1]
  937. [k,v]
  938. }
  939. hash = Hash[*ids.flatten]
  940. hash.each{|k,v|
  941. if k =~ /^clip_info_name/
  942. hash[v.downcase] = hash[k.sub("name", "value")]
  943. end
  944. }
  945. f = {
  946. '85' => 'MP3',
  947. 'fLaC' => 'FLAC',
  948. 'vrbs' => 'Vorbis',
  949. 'hwac3' => 'AC3',
  950. '1' => 'PCM',
  951. '7' => 'Sun Audio',
  952. '353' => 'Windows Media Audio'
  953. }
  954. hash['audio_format'] = f[hash['audio_format']] if f[hash['audio_format']]
  955. hash
  956. end
  957. def extract_extract_info(filename)
  958. arr = secure_filename(filename){|tfn| `extract #{tfn}` }.strip.split("\n").map{|s| s.split(" - ",2) }
  959. h = arr.to_hash
  960. filenames = arr.find_all{|k,v| k == 'filename' }.map{|k,v| enc_utf8(v, nil) }
  961. keywords = arr.find_all{|k,v| k == 'keywords' }.map{|k,v| enc_utf8(v, nil) }
  962. revisions = arr.find_all{|k,v| k == 'revision history' }.map{|k,v| enc_utf8(v, nil) }
  963. md = {
  964. 'Doc.Title', enc_utf8(h['title'], nil),
  965. 'Doc.Subject', enc_utf8(h['subject'], nil),
  966. 'Doc.Author', enc_utf8(h['creator'], nil),
  967. 'Doc.LastSavedBy', enc_utf8(h['last saved by'], nil),
  968. 'Doc.Language', enc_utf8(h['language'], nil),
  969. 'Doc.Artist', enc_utf8(h['artist'], nil),
  970. 'Doc.Genre', enc_utf8(h['genre'], nil),
  971. 'Doc.Album', enc_utf8(h['album'], nil),
  972. 'Doc.Language', enc_utf8(h['language'], nil),
  973. 'Doc.Created', parse_time(h['creation date']),
  974. 'Doc.Modified', parse_time(h['modification date'] || h['date']),
  975. 'Doc.RevisionHistory', revisions.empty? ? nil : revisions,
  976. 'Doc.Description', enc_utf8(h['description'], nil),
  977. 'Doc.Keywords', keywords.empty? ? nil : keywords,
  978. 'File.Software', enc_utf8(h['software'] || h['generator'], nil),
  979. 'Doc.Template', enc_utf8(h['template'], nil),
  980. 'Archive.Contents', filenames.empty? ? nil : filenames,
  981. 'Doc.WordCount', parse_num(h['word count'], :i),
  982. 'Doc.PageCount', parse_num(h['page count'], :i),
  983. 'Doc.ParagraphCount', parse_num(h['paragraph count'], :i),
  984. 'Doc.LineCount', parse_num(h['line count'], :i),
  985. 'Doc.CharacterCount', parse_num(h['character count'], :i)
  986. }
  987. md.delete_if{|k,v| v.nil? }
  988. md
  989. end
  990. def base64 s
  991. return nil if s.nil? || s.empty?
  992. return Base64.encode64(s)
  993. end
  994. def id3lib_extract(fn, charset)
  995. gem_require 'id3lib'
  996. t = ID3Lib::Tag.new(fn)
  997. time = t.year
  998. if t.date
  999. time = "#{time}-#{t.date[2,2]}-#{t.date[0,2]}"
  1000. end
  1001. unless charset
  1002. ls = [t.title, t.artist, t.album, t.lyrics, t.comment].join
  1003. charset = ls.chardet if ls and not ls.empty?
  1004. charset = nil if charset =~ /ISO-8859|windows-1252/i
  1005. end
  1006. {
  1007. 'Audio.Title' => enc_utf8(t.title, charset),
  1008. 'Audio.Subtitle' => enc_utf8(t.subtitle, charset),
  1009. 'Audio.Artist' => enc_utf8(t.artist, charset),
  1010. 'Audio.Band' => enc_utf8(t.band, charset),
  1011. 'Audio.Composer' => enc_utf8(t.composer, charset),
  1012. 'Audio.Performer' => enc_utf8(t.performer, charset),
  1013. 'Audio.Conductor' => enc_utf8(t.conductor, charset),
  1014. 'Audio.Lyricist' => enc_utf8(t.lyricist, charset),
  1015. 'Audio.RemixedBy' => enc_utf8(t.remixed_by, charset),
  1016. 'Audio.InterpretedBy' => enc_utf8(t.interpreted_by, charset),
  1017. 'Audio.Genre' => parse_genre(enc_utf8(t.genre, charset)),
  1018. 'Audio.Grouping' => enc_utf8(t.grouping, charset),
  1019. 'Audio.Album' => enc_utf8(t.album, charset),
  1020. 'Audio.Publisher' => enc_utf8(t.publisher, charset),
  1021. 'Audio.ReleaseDate' => parse_time(time),
  1022. 'Audio.DiscNo' => parse_num(t.disc, :i),
  1023. 'Audio.TrackNo' => parse_num(t.track, :i),
  1024. 'Audio.Tempo' => parse_num(t.bpm, :i),
  1025. 'Audio.Comment' => enc_utf8(t.comment, charset),
  1026. 'Audio.Lyrics' => enc_utf8(t.lyrics, charset),
  1027. 'Audio.Image' => base64(t.find_all{|f| f[:id] == :APIC }.map{|f| f[:data] }[0])
  1028. }
  1029. end
  1030. def extract_exif_tag(exif, filename, *tags)
  1031. tag = tags.find{|t| exif[t] }
  1032. value = exif[tag]
  1033. if value and value =~ /\A\s*\(Binary data \d+ bytes, use -b option to extract\)\s*\Z/
  1034. value = secure_filename(filename){|tfn|
  1035. `exiftool -b -#{tag} #{tfn} 2>/dev/null`
  1036. }
  1037. end
  1038. value
  1039. end
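# Reads tags with exiftool (tab-separated, one per line) and maps them to
# Image.*, File.Software and Location.* fields; binary tags are fetched
# separately via extract_exif_tag.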
  1040. def extract_exif(filename, charset=nil)
  1041. exif = {}
  1042. raw_exif = secure_filename(filename){|tfn|
  1043. `exiftool -s -t -c "%.6f" -d "%Y:%m:%dT%H:%M:%S%Z" #{tfn} 2>/dev/null`
  1044. }.split("\n", 8).last
  1045. raw_exif.strip.split("\n").each do |t|
  1046. k,v = t.split("\t", 2)
  1047. exif[k] = v
  1048. end
  1049. ex = lambda{|tags| enc_utf8( extract_exif_tag(exif, filename, *tags), charset ) }
  1050. info = {
  1051. 'Image.Description' => ex[%w(ImageDescription Description Caption-Abstract Comment)],
  1052. 'Image.Creator' => ex[%w(Artist Creator By-line)],
  1053. 'Image.Editor' => ex[["Editor"]],
  1054. 'File.Software' => ex[["Software"]],
  1055. 'Image.OriginatingProgram' => ex[["OriginatingProgram"]],
  1056. 'Image.ExposureProgram' => ex[["ExposureProgram"]],
  1057. 'Image.Copyright' => ex[%w(Copyright CopyrightNotice CopyrightURL)],
  1058. 'Image.ISOSpeed' => parse_num(exif["ISO"], :f),
  1059. 'Image.Fnumber' => parse_num(exif["FNumber"], :f),
  1060. 'Image.Flash' => exif["FlashFired"] ?
  1061. enc_utf8(exif["FlashFired"], charset) == "True" : nil,
  1062. 'Image.FocalLength' => parse_num(exif["FocalLength"], :f),
  1063. 'Image.WhiteBalance' => ex[["WhiteBalance"]],
  1064. 'Image.CameraMake' => ex[['Make']],
  1065. 'Image.CameraModel' => ex[['Model']],
  1066. 'Image.Title' => ex[['Title']],
  1067. 'Image.ColorMode' => ex[['ColorMode']],
  1068. 'Image.ColorSpace' => ex[['ColorSpace']],
  1069. 'Image.EXIF' => enc_utf8(raw_exif, charset),
  1070. 'Location.Latitude' => parse_num(exif['GPSLatitude'], :f),
  1071. 'Location.Longitude' => parse_num(exif['GPSLongitude'], :f)
  1072. }
  1073. if exif["MeteringMode"]
  1074. info['Image.MeteringMode'] = enc_utf8(exif["MeteringMode"].split(/[^a-z]/i).map{|s|s.capitalize}.join, charset)
  1075. end
  1076. if t = exif["ModifyDate"]
  1077. info['Image.Date'] =
  1078. info['Image.Modified'] = parse_time(t.split(":",3).join("-"))
  1079. end
  1080. if t = exif["DateCreated"]
  1081. info['Image.Date'] =
  1082. info['Image.DateCreated'] = parse_time(t.split(":",3).join("-"))
  1083. end
  1084. if t = exif["DateTimeCreated"]
  1085. info['Image.Date'] =
  1086. info['Image.DateTimeCreated'] = parse_time(t.split(":",3).join("-"))
  1087. end
  1088. info['Image.Date'] = info['Image.Date'].dup if info['Image.Date']
  1089. if t = exif["DateTimeOriginal"]
  1090. info['Image.DateTimeOriginal'] = parse_time(t.split(":",3).join("-"))
  1091. end
  1092. if exif['ExposureTime']
  1093. d,n = exif['ExposureTime'].split("/")
  1094. n ||= 1.0
  1095. info['Image.ExposureTime'] = d.to_f / n.to_f
  1096. end
  1097. info
  1098. end
  1099. def extract_dcraw(filename)
  1100. hash = {}
  1101. secure_filename(filename){|tfn| `dcraw -i -v #{tfn}` }.strip.split("\n").
  1102. each do |t|
  1103. k,v = t.split(/:\s*/, 2)
  1104. hash[k] = v
  1105. end
  1106. w, h = hash["Output size"].split("x",2).map{|s| parse_num(s.strip, :f) }
  1107. t = hash
  1108. info = {
  1109. 'Image.Width', w,
  1110. 'Image.Height', h,
  1111. 'Image.FilterPattern', t['Filter pattern'],
  1112. 'Image.FocalLength', parse_num(t['Focal length'], :f),
  1113. 'Image.ISOSpeed', parse_num(t['ISO speed'], :f),
  1114. 'Image.CameraModel', enc_utf8(t['Camera'], nil),
  1115. 'Image.ComponentCount', parse_num(t['Raw colors'], :i),
  1116. 'Image.Fnumber', parse_num(t['Aperture'], :f)
  1117. }
  1118. if t['Shutter']
  1119. d,n = t['Shutter'].split("/")
  1120. n ||= 1.0
  1121. info['Image.ExposureTime'] = d.to_f / n.to_f
  1122. end
  1123. info
  1124. end
  1125. def pdfinfo_extract_info(filename)
  1126. ids = secure_filename(filename){|tfn| `pdfinfo #{tfn}` }.strip.split("\n").
  1127. map{|r|
  1128. k,v = r.split(":",2)
  1129. k = k.downcase
  1130. v = parse_val(v.strip)
  1131. [k,v]
  1132. }
  1133. i = Hash[*ids.flatten]
  1134. if i['page size']
  1135. w,h = i['page size'].gsub(/[^0-9.]/, ' ').strip.split(/\s+/,2)
  1136. wmm = w.to_f.points_to_mm
  1137. hmm = h.to_f.points_to_mm
  1138. i['page_size'] = i['page size'].scan(/\(([^)]+)\)/).flatten[0]
  1139. i['width'] = wmm
  1140. i['height'] = hmm
  1141. i['dimensions_unit'] = 'mm'
  1142. end
  1143. i
  1144. end
  1145. def citeseer_extract(title)
  1146. require 'metadata/citeseer'
  1147. h = CiteSeer.get_info(title)
  1148. return h if h.empty?
  1149. m = {}
  1150. m['Doc.Title'] = h['title']
  1151. m['Doc.Author'] = (h['creator'] || h['author'])
  1152. m['Doc.Description'] = h['description']
  1153. m['Doc.Publisher'] = h['publisher']
  1154. m['Doc.Contributor'] = h['contributor']
  1155. m['Doc.Subject'] = h['subject']
  1156. m['Doc.Source'] = h['source'] || h['ee']
  1157. m['Doc.CiteSeerURL'] = h['identifier']
  1158. m['Doc.Language'] = h['language']
  1159. m['Doc.Publication'] = h['book'] || h['booktitle'] || h['journal']
  1160. m['Doc.PublicationPages'] = h['pages']
  1161. m['Doc.Citations'] = h['citations']
  1162. m['Doc.Published'] = parse_time(h['date'] || h['year'])
  1163. m['Doc.CiteSeerIdentifier'] = h['bibtex_id']
  1164. m.delete_if{|k,v| !v }
  1165. m
  1166. end
  1167. def dblp_extract(title)
  1168. require 'metadata/dblp'
  1169. h = DBLP.get_info(title)
  1170. return h if h.empty?
  1171. m = {}
  1172. m['Doc.Title'] = h['title']
  1173. m['Doc.Author'] = h['author']
  1174. m['Doc.Description'] = h['description']
  1175. m['Doc.Publisher'] = h['publisher']
  1176. m['Doc.Contributor'] = h['contributor']
  1177. m['Doc.Subject'] = h['subject']
  1178. m['Doc.Source'] = h['ee']
  1179. m['Doc.CrossRef'] = h['crossref']
  1180. m['Doc.BibSource'] = h['bibsource']
  1181. m['Doc.Language'] = h['language']
  1182. m['Doc.Publication'] = h['book'] || h['booktitle'] || h['journal']
  1183. m['Doc.PublicationPages'] = h['pages']
  1184. m['Doc.Published'] = parse_time(h['date'] || h['year'])
  1185. m['Doc.BibTexType'] = h['bibtex_type']
  1186. m['Doc.DBLPIdentifier'] = h['bibtex_id']
  1187. m.delete_if{|k,v| !v }
  1188. m
  1189. end
  1190. # Create a link to `filename' with a secure filename and yield it.
  1191. # Unlinks secure filename after yield returns.
  1192. #
  1193. # This is needed because of filenames like "-h".
  1194. #
  1195. # If the filename doesn't begin with a dash, it is passed in
  1196. # double quotes, with double quotes and dollar signs in the
  1197. # filename escaped.
  1198. #
  1199. def secure_filename(filename)
  1200. require 'fileutils'
  1201. if filename =~ /^-/
  1202. dirname = File.dirname(File.expand_path(filename))
  1203. tfn = "/tmp/" + temp_filename + (File.extname(filename) || "").
  1204. gsub(/[^a-z0-9_.]/i, '_') # PAA RAA NOO IAA
  1205. begin
  1206. FileUtils.ln(filename, tfn)
  1207. rescue
  1208. FileUtils.cp(filename, tfn) # different fs for /tmp
  1209. end
  1210. yield(tfn)
  1211. else # trust the filename to not blow up in our face
  1212. yield(%Q("#{filename.gsub(/[$"]/, "\\\\\\0")}"))
  1213. end
  1214. ensure
  1215. File.unlink(tfn) if tfn and File.exist?(tfn)
  1216. end
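# Usage sketch ("-odd name.pdf" is a hypothetical path; this mirrors how the
# extractors above call it):
#
#   secure_filename("-odd name.pdf"){|sfn| `pdfinfo #{sfn}` }
#   # yields a hard link (or copy) under /tmp for dash-prefixed names,
#   # otherwise the original path shell-quoted, and unlinks the temp file
#   # after the block returns.
#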
  1217. def temp_filename
  1218. "metadata_temp_#{Process.pid}_#{Thread.current.object_id}_#{Time.now.to_f}"
  1219. end
  1220. def parse_val(v)
  1221. case v
  1222. when /^[0-9]+$/: v.to_i
  1223. when /^[0-9]+(\.[0-9]+)?$/: v.to_f
  1224. else
  1225. v
  1226. end
  1227. end
  1228. def enc_utf8(s, charset)
  1229. return nil if s.nil? or s.empty?
  1230. s.to_utf8(charset)
  1231. end
  1232. def parse_num(s, cast=nil)
  1233. if s.is_a? Numeric
  1234. return (
  1235. case cast
  1236. when :f
  1237. s.to_f
  1238. when :i
  1239. s.to_i
  1240. else
  1241. s
  1242. end
  1243. )
  1244. end
  1245. return nil if s.nil? or s.empty? or not s.scan(/[0-9]+/)[0]
  1246. case cast
  1247. when :i
  1248. num = nil
  1249. s.sub(/[0-9]+/){|h| num = h }
  1250. if num
  1251. num.to_i
  1252. else
  1253. nil
  1254. end
  1255. when :f
  1256. num = nil
  1257. s.sub(/[0-9]+(\.[0-9]+(e[-+]?[0-9]+)?)?/i){|h| num = h }
  1258. if num
  1259. num.to_f
  1260. else
  1261. nil
  1262. end
  1263. else
  1264. s.scan(/[0-9]+/)[0]
  1265. end
  1266. end
  1267. def parse_time(s)
  1268. return s if s.is_a?(DateTime)
  1269. return nil if s.nil? or s.empty?
  1270. DateTime.parse(s.to_s)
  1271. rescue
  1272. t = s.to_s.scan(/\d{4}/)[0]
  1273. if t.nil?
  1274. t = s.to_s.scan(/\d{2}/)[0]
  1275. unless t.nil?
  1276. y = Time.now.year.to_s
  1277. t = "#{t.to_i > y[-2,2].to_i ? y[0,2].to_i-1 : y[0,2]}#{t}-01-01"
  1278. DateTime.parse(t)
  1279. else
  1280. nil
  1281. end
  1282. else
  1283. t += "-01-01"
  1284. DateTime.parse(t)
  1285. end
  1286. end
  1287. def parse_genre(s)
  1288. gem_require 'id3lib'
  1289. return nil if s.nil? or s.empty?
  1290. return s unless s =~ /^\(\d+\)/
  1291. genre_num = s.scan(/\d+/).first.to_i
  1292. ID3Lib::Info::Genres[genre_num] || s
  1293. end
  1294. def remove_ligatures(s)
  1295. return s unless s.is_a?(String)
  1296. s.gsub("æ", 'ae').
  1297. gsub("ä", 'ae').
  1298. gsub("ö", 'oe').
  1299. gsub("å", 'o').
  1300. gsub("Æ", 'AE').
  1301. gsub("œ", "ce").
  1302. gsub("Œ", "CE").
  1303. gsub("ŋ", "ng").
  1304. gsub("Ŋ", "NG").
  1305. gsub("ʩ", "fng").
  1306. gsub("ﬀ", "ff").
  1307. gsub("ﬁ", "fi").
  1308. gsub("ﬂ", "fl").
  1309. gsub("ﬃ", "ffi").
  1310. gsub("ﬄ", "ffl").
  1311. gsub("ſt", "ft").
  1312. gsub("ﬆ", "st").
  1313. gsub("ß", "ss")
  1314. end
  1315. end