
/lib/metadata/extract.rb

https://github.com/devilcoders/metadata
Ruby | 1451 lines | 1415 code | 26 blank | 10 comment | 31 complexity
  1. require 'iconv'
  2. require 'pathname'
  3. require 'time'
  4. require 'date'
  5. require 'base64'
  6. require 'lib/metadata/mime_info'
  7. class Pathname
  8. attr_accessor 'mimetype'
  9. def mimetype
  10. @mimetype ||= MimeInfo.get(to_s)
  11. end
  12. def pages
  13. @pages ||= (metadata['Doc.PageCount'] || 1)
  14. end
  15. def dimensions
  16. @dimensions ||= [width, height]
  17. end
  18. def metadata(mime=mimetype, charset=nil, pdf=nil)
  19. @metadata ||= Metadata.extract(self, mime || mimetype, charset, pdf)
  20. end
  21. def length
  22. @length ||= (metadata['Audio.Duration'] || metadata['Video.Duration'] || metadata['Doc.WordCount'].to_i / 250.0)
  23. end
  24. def width
  25. metadata['Image.Width']
  26. end
  27. def height
  28. metadata['Image.Height']
  29. end
  30. def to_pn(*rest)
  31. pn = self
  32. pn = pn.join(*rest) unless rest.empty?
  33. pn
  34. end
  35. end
  36. class String
  37. def to_pn(*rest)
  38. pn = Pathname.new(self)
  39. pn = pn.join(*rest) unless rest.empty?
  40. pn
  41. end
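# Guesses this string's character encoding: asks the external `chardet`
# tool first, then falls back to BOM sniffing and Iconv round-trip tests
# against a list of common encodings.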
  42. def chardet
  43. cset = IO.popen("chardet", "r+"){|cd|
  44. Thread.new {
  45. cd.write(self[0,65536])
  46. cd.close_write
  47. }
  48. # Some chardet versions output '<stdin>: ascii (Confidence: 1.00)',
  49. # so we need to strip off the head and the tail.
  50. cd.read.strip.sub(/^[^:]*:\s*/,'').sub(/\s*\(.*/, '')
  51. }
  52. if cset == 'None'
  53. charsets = ['utf-8',
  54. 'utf-16', 'utf-16be', 'utf-32', 'utf-32be',
  55. 'shift-jis','euc-jp',
  56. 'iso8859-1','cp1252',
  57. 'big-5','gbk','gb18030','gb2312'].compact
  58. pk = $KCODE
  59. $KCODE = 'ascii'
  60. case self
  61. when /\A(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00)/
  62. charsets.unshift 'utf-32'
  63. when /\A(\xFE\xFF|\xFF\xFE)/
  64. charsets.unshift 'utf-16'
  65. when /\A\xEF\xBB\xBF/
  66. charsets.unshift 'utf-8'
  67. when /\A[a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]+\Z/m
  68. charsets.unshift 'ascii' unless self.include?("\000")
  69. end
  70. $KCODE = pk
  71. cset = charsets.find{|c|
  72. ((Iconv.iconv('utf-8', c, self)[0]) rescue false)
  73. }
  74. end
  75. if self.count("\000")*2 >= length and cset == 'ascii'
  76. cset = 'utf-16' + (self.index("\000") % 2 == 0 ? 'le' : 'be')
  77. end
  78. if cset =~ /windows-1255/i and self =~ /[a-z](\344|\366|\326|\304)[a-z]/
  79. cset = 'windows-1252'
  80. end
  81. cset
  82. end
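# Converts this string to UTF-8. Tries the caller-supplied charset first,
# then the chardet guess, then a list of common encodings; strips BOMs and
# null bytes from the result.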
  83. def to_utf8(charset=nil)
  84. us = nil
  85. charsets = [charset, 'utf-8',
  86. 'utf-16', 'utf-16be', 'utf-32', 'utf-32be',
  87. 'shift-jis','euc-jp',
  88. 'iso8859-1','cp1252',
  89. 'big-5','gbk','gb18030','gb2312']
  90. cd = chardet
  91. pk = $KCODE
  92. $KCODE = 'ascii'
  93. if cd
  94. case cd
  95. when /iso-8859|windows-1252/i
  96. na_re = /[^a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]/
  97. nl = gsub(na_re,'').length
  98. if length > 1.5 * nl
  99. charsets.insert(8, cd) # low ascii content
  100. else
  101. charsets.insert(2, cd) # high ascii content
  102. end
  103. when /utf/i
  104. charsets.insert(1, cd)
  105. else
  106. charsets.insert(2, cd)
  107. end
  108. end
  109. charsets.compact!
  110. case self
  111. when /\A(\x00\x00\xFE\xFF|\xFF\xFE\x00\x00)/
  112. charsets.unshift 'utf-32'
  113. bom = true
  114. when /\A(\xFE\xFF|\xFF\xFE)/
  115. charsets.unshift 'utf-16'
  116. bom = true
  117. when /\A\xEF\xBB\xBF/
  118. charsets.unshift 'utf-8'
  119. bom = true
  120. when /\A[a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]+\Z/m
  121. charsets.unshift 'ascii' unless self.include?("\000")
  122. when /\301\265|\220\333/
  123. charsets.unshift 'gbk'
  124. end
  125. $KCODE = pk
  126. cset = charsets.find{|c|
  127. ((us = Iconv.iconv('utf-8', c, self)[0]) rescue false)
  128. }
  129. if not bom
  130. if cset =~ /^utf-(16|32)(le|$)/i
  131. na_re = /[^a-zA-Z0-9_.:;,\{\}\(\)\\\/\[\]\n\t -]/
  132. if us.length > 1.9 * us.gsub(na_re,'').length
  133. rcset = cset.sub(/le|$/){|m| m == 'be' ? 'le' : 'be' }
  134. nus = ((Iconv.iconv('utf-8', rcset, self)[0]) rescue false)
  135. if nus and (nus.gsub(na_re,'').length > us.gsub(na_re,'').length)
  136. us = nus
  137. end
  138. end
  139. end
  140. end
  141. us ||= self.gsub(/[^0-9a-z._ '"\*\+\-]/,'?')
  142. us.sub!(/\A(\x00\x00\xFE\xFF|(\xFF\xFE(\x00\x00)?)|\xEF\xBB\xBF|\xFE\xFF)/, '') # strip UTF BOMs
  143. us.tr!("\0", "") # strip null bytes
  144. us
  145. end
  146. end
  147. class Array
  148. def to_hash
  149. h = {}
  150. each{|k,v| h[k] = v}
  151. h
  152. end
  153. end
  154. class Numeric
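# Point/millimetre conversions used for PDF page sizes (1 pt = 25.4/72 mm, roughly 0.3528 mm).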
  155. def points_to_mm
  156. self * 0.3528
  157. end
  158. def mm_to_points
  159. self / 0.3528
  160. end
  161. end
  162. module Metadata
  163. extend self
  164. attr_accessor(:quiet, :verbose,
  165. :sha1sum, :md5sum,
  166. :no_text, :guess_title, :guess_metadata, :guess_pubdata,
  167. :use_citeseer, :use_dblp,
  168. :include_name, :include_path)
  169. # Extracts metadata from a file by guessing mimetype and calling matching
  170. # extractor methods (which mostly call external programs to do their bidding.)
  171. #
  172. # Metadata.extract('foo.png')
  173. #
  174. # Follows the Shared File Metadata Spec naming.
  175. # http://wiki.freedesktop.org/wiki/Specifications/shared-filemetadata-spec
  176. #
  177. # There are a couple flags that control the behaviour of
  178. # the metadata extractor:
  179. #
  180. # Metadata.sha1sum = true # include File.SHA1Sum in the metadata
  181. # Metadata.md5sum = true # include File.MD5Sum in the metadata
  182. # Metadata.include_name = true # include File.Name (file basename)
  183. # Metadata.include_path = true # include File.Path (file dirname)
  184. # Metadata.quiet = true # override verbose to false
  185. # Metadata.verbose = true # print out status messages to stderr
  186. #
  187. # All strings are converted to UTF-8.
  188. #
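# A minimal usage sketch mirroring the flags documented above
# ('report.pdf' is a hypothetical input file):
#
#   Metadata.include_name = true
#   Metadata.sha1sum = true
#   md = Metadata.extract('report.pdf')
#   md['File.Format']  # => e.g. "application/pdf"
#   md['Doc.Title']    # => the title, if one could be extracted
#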
  189. def extract(filename, mimetype=MimeInfo.get(filename.to_s), charset=nil, pdf=nil)
  190. verbose = self.verbose && !quiet
  191. filename = filename.to_s
  192. mimetype = Mimetype[mimetype] unless mimetype.is_a?( Mimetype )
  193. unless File.exist?(filename)
  194. rv = {}
  195. if self.include_name
  196. rv['File.Name'] = enc_utf8(File.basename(filename), nil)
  197. end
  198. if self.include_path
  199. rv['File.Path'] = enc_utf8(File.dirname(filename), nil)
  200. end
  201. rv['File.Format'] ||= mimetype.to_s
  202. return rv
  203. end
  204. mts = mimetype.ancestors
  205. mt = mts.shift
  206. rv = nil
  207. new_methods = public_methods(false)
  208. #verbose message
  209. # TODO: send in json to browser
  210. STDERR.puts "Processing #{filename}", " Metadata extraction" if verbose
  211. while mt.is_a?(Mimetype) and mt != Mimetype
  212. #verbose message
  213. # TODO: send in json to browser
  214. STDERR.puts " Trying #{mt}" if verbose
  215. mn = mt.to_s.gsub(/[^a-z0-9]/i,"_")
  216. if new_methods.include?( mn )
  217. begin
  218. rv = __send__( mn, filename, charset )
  219. #verbose message
  220. # TODO: send in json to browser
  221. STDERR.puts " OK" if verbose
  222. break
  223. rescue => e
  224. #verbose message
  225. # TODO: send in json to browser
  226. STDERR.puts(e, e.message, e.backtrace) if verbose
  227. end
  228. end
  229. mt = mts.shift
  230. end
  231. unless rv
  232. #verbose message
  233. # TODO: send in json to browser
  234. STDERR.puts " Falling back to extract" if verbose
  235. rv = extract_extract_info(filename)
  236. end
  237. if self.include_name
  238. rv['File.Name'] = enc_utf8(File.basename(filename), nil)
  239. end
  240. if self.include_path
  241. rv['File.Path'] = enc_utf8(File.dirname(filename), nil)
  242. end
  243. rv['File.Format'] ||= mimetype.to_s
  244. if File.file?(filename)
  245. if self.sha1sum
  246. secure_filename(filename){|sfn|
  247. rv['File.SHA1Sum'] = `sha1sum #{sfn}`.split(" ",2)[0]
  248. }
  249. end
  250. if self.md5sum
  251. secure_filename(filename){|sfn|
  252. rv['File.MD5Sum'] = `md5sum #{sfn}`.split(" ",2)[0]
  253. }
  254. end
  255. end
  256. rv['File.Size'] = (
  257. if File.directory?(filename)
  258. Dir.entries(filename).size-2
  259. else
  260. File.size(filename)
  261. end)
  262. rv['File.Content'] = extract_text(filename, mimetype, charset, false) unless Metadata.no_text
  263. pdf ||= filename + "-temp.pdf"
  264. if File.exist?(pdf)
  265. pdf_metadata = application_pdf(pdf, charset)
  266. overrides = %w(Image.DimensionUnit Image.Width Image.Height Doc.PageCount
  267. Doc.PageSizeName)
  268. optrides = %w(Doc.WordCount Doc.Title Doc.Author)
  269. overrides.each{|o| rv[o] = pdf_metadata[o] }
  270. optrides.each {|o| rv[o] ||= pdf_metadata[o] }
  271. if !Metadata.no_text and not mimetype.to_s =~ /postscript/
  272. rv['File.Content'] = extract_text(pdf, Mimetype['application/pdf'], charset, false)
  273. end
  274. end
  275. if guess_title or guess_metadata or guess_pubdata
  276. gem_require 'metadata/title_guesser'
  277. gem_require 'metadata/publication_guesser'
  278. gem_require 'metadata/reference_guesser'
  279. text = (rv['File.Content'] || extract_text(filename, mimetype, charset, false))
  280. guess = extract_guesses(text)
  281. if guess['Doc.Title'] and (rv['Doc.Title'].nil? or rv['Doc.Title'] =~ /(^[a-z])|((\.(dvi|doc)|WORD)$)|^Slide 1$|^PowerPoint Presentation$/)
  282. rv['Doc.Title'] = guess['Doc.Title']
  283. end
  284. end
  285. if use_citeseer and rv['Doc.Title'] and mimetype.to_s =~ /pdf|postscript|dvi|tex/
  286. rv.merge!(citeseer_extract(rv['Doc.Title']))
  287. end
  288. if use_dblp and rv['Doc.Title'] and mimetype.to_s =~ /pdf|postscript|dvi|tex/
  289. rv.merge!(dblp_extract(rv['Doc.Title']))
  290. end
  291. if guess_metadata or guess_pubdata
  292. %w(Doc.Publisher Doc.Published Doc.Publication Doc.Genre Event.Name Event.Organizer
  293. ).each{|field|
  294. rv[field] ||= guess[field] if guess[field]
  295. }
  296. end
  297. if guess_metadata
  298. %w(Doc.Citations Doc.Description Doc.ACMCategories Doc.Keywords
  299. ).each{|field|
  300. rv[field] ||= guess[field] if guess[field]
  301. }
  302. end
  303. rv['File.Modified'] = parse_time(File.mtime(filename.to_s).iso8601)
  304. rv.delete_if{|k,v| v.nil? }
  305. rv
  306. end
  307. # Extracts text from a file by guessing mimetype and calling matching
  308. # extractor methods (which mostly call external programs to do their bidding.)
  309. #
  310. # The extracted text is converted to UTF-8.
  311. #
  312. def extract_text(filename, mimetype=MimeInfo.get(filename.to_s), charset=nil, layout=false)
  313. filename = filename.to_s
  314. mimetype = Mimetype[mimetype] unless mimetype.is_a?( Mimetype )
  315. mts = mimetype.ancestors
  316. mt = mts.shift
  317. new_methods = public_methods(false)
  318. #verbose message
  319. # TODO: send in json to browser
  320. STDERR.puts " Text extraction" if verbose
  321. while mt.is_a?(Mimetype) and mt != Mimetype
  322. #verbose message
  323. # TODO: send in json to browser
  324. STDERR.puts " Trying #{mt}" if verbose
  325. mn = mt.to_s.gsub(/[^a-z0-9]/i,"_") + "__gettext"
  326. if new_methods.include?( mn )
  327. begin
  328. rv = __send__( mn, filename, charset, layout )
  329. #verbose message
  330. # TODO: send in json to browser
  331. STDERR.puts " OK" if verbose
  332. return rv
  333. rescue => e
  334. #verbose message
  335. # TODO: send in json to browser
  336. STDERR.puts(e, e.message, e.backtrace) unless quiet
  337. end
  338. end
  339. mt = mts.shift
  340. end
  341. #verbose message
  342. # TODO: send in json to browser
  343. STDERR.puts " Text extraction failed" if verbose
  344. nil
  345. end
  346. alias_method :[], 'extract'
  347. def gem_require(libname)
  348. retried = false
  349. begin
  350. require libname
  351. rescue LoadError
  352. unless retried
  353. #verbose message
  354. # TODO: send in json to browser
  355. STDERR.puts "Requiring rubygems" if verbose
  356. require 'rubygems'
  357. retried = true
  358. retry
  359. else
  360. raise
  361. end
  362. end
  363. end
  364. def extract_guesses(text)
  365. return {} unless text
  366. guess = {}
  367. title = TitleGuesser.guess_title(text)
  368. pubdata = PublicationGuesser.guess_pubdata(text)
  369. str = remove_ligatures(text).split(/\f+/m)[0,2].join("\n")
  370. abstract = str.scan(
  371. /\babstract\s*\n(.+)\n\s*((\d+\.)|(\d\.?)*\s*(keywords|categories|introduction|(\d\.?)\s*[a-z]+))\s*\n/im
  372. ).flatten.first
  373. if abstract
  374. abstract.gsub!(/\A(\s*[a-z]+@([a-z]+\.)+[a-z]+\s*)+/im, '')
  375. if abstract.size > 500
  376. abstract = abstract.split(/(?=\n)/).inject(""){|s,i|
  377. s << i unless s.size > 500
  378. s
  379. }
  380. end
  381. end
  382. kw_re = /\bkeywords:?\b/i
  383. cat_re = /\bcategories:?\b/i
  384. acm_cat_re = /\b([A-K]\.(\d(\.\d)?)?)\b/
  385. kw_list_re = /(([^\.]+,)+[^\.\n]+)/m
  386. if str =~ cat_re
  387. cats = str.split(cat_re,2).last.
  388. scan(acm_cat_re).
  389. map{|hit| hit[0] }
  390. end
  391. if str =~ kw_re
  392. kws = str.split(kw_re)[1..-1].map{|kw|
  393. kw.scan(kw_list_re).flatten.first
  394. }.compact.
  395. map{|hit| hit.split(/\s*,\s*/).map{|s|s.strip} }.
  396. max{|a,b| a.length <=> b.length }
  397. end
  398. # cites = ReferenceGuesser.guess_references(text)
  399. guess['Doc.Title'] = title.strip.to_utf8 if title and title.strip.size < 100
  400. guess['Doc.Description'] = abstract.strip.to_utf8 if abstract
  401. # guess['Doc.Citations'] = cites if cites and not cites.empty?
  402. guess['Doc.Keywords'] = kws if kws and not kws.empty?
  403. if cats and not cats.empty?
  404. require 'metadata/acm_categories'
  405. guess['Doc.ACMCategories'] = cats.map{|cat|
  406. "#{cat.upcase} #{ACM_CATEGORIES[cat.upcase]}"
  407. }
  408. end
  409. guess = guess.merge(pubdata)
  410. guess
  411. end
  412. #TODO: Generate audio waveform
  413. def audio_x_flac(fn, charset)
  414. gem_require 'flacinfo'
  415. m = nil
  416. begin
  417. m = FlacInfo.new(fn)
  418. rescue # FlacInfo fails for flacs with id3 tags
  419. return audio(fn, charset)
  420. end
  421. t = m.tags
  422. si = m.streaminfo
  423. len = si["total_samples"].to_f / si["samplerate"]
  424. md = {
  425. 'Audio.Codec' => 'FLAC',
  426. 'Audio.Title' => enc_utf8(t['TITLE'], charset),
  427. 'Audio.Artist' => enc_utf8(t['ARTIST'], charset),
  428. 'Audio.Album' => enc_utf8(t['ALBUM'], charset),
  429. 'Audio.Comment' => enc_utf8(t['COMMENT'], charset),
  430. 'Audio.Bitrate' => File.size(fn)*8 / len,
  431. 'Audio.Duration' => len,
  432. 'Audio.Samplerate' => si["samplerate"],
  433. 'Audio.VariableBitrate' => true,
  434. 'Audio.Genre' => parse_genre(enc_utf8(t['GENRE'], charset)),
  435. 'Audio.ReleaseDate' => parse_time(t['DATE']),
  436. 'Audio.TrackNo' => parse_num(t['TRACKNUMBER'], :i),
  437. 'Audio.Channels' => si["channels"]
  438. }
  439. ad = (audio(fn, charset) rescue {})
  440. ad.delete_if{|k,v| v.nil? }
  441. md.merge(ad)
  442. end
  443. #TODO: Generate audio waveform
  444. def audio_mp4(fn, charset)
  445. gem_require 'mp4info'
  446. m = MP4Info.open(fn)
  447. tn, total = m.TRKN
  448. md = {
  449. 'Audio.Title' => enc_utf8(m.NAM, charset),
  450. 'Audio.Artist' => enc_utf8(m.ART, charset),
  451. 'Audio.Album' => enc_utf8(m.ALB, charset),
  452. 'Audio.Bitrate' => m.BITRATE,
  453. 'Audio.Duration' => m.SECS,
  454. 'Audio.Samplerate' => m.FREQUENCY*1000,
  455. 'Audio.VariableBitrate' => true,
  456. 'Audio.Genre' => parse_genre(enc_utf8(m.GNRE, charset)),
  457. 'Audio.ReleaseDate' => parse_time(m.DAY),
  458. 'Audio.TrackNo' => parse_num(tn, :i),
  459. 'Audio.AlbumTrackCount' => parse_num(total, :i),
  460. 'Audio.Writer' => enc_utf8(m.WRT, charset),
  461. 'Audio.Copyright' => enc_utf8(m.CPRT, charset),
  462. 'Audio.Tempo' => parse_num(m.TMPO, :i),
  463. 'Audio.Codec' => enc_utf8(m.ENCODING, charset),
  464. 'Audio.AppleID' => enc_utf8(m.APID, charset),
  465. 'Audio.Image' => base64(m.COVR),
  466. }
  467. end
  468. #TODO: Generate audio waveform
  469. def audio_x_ms_wma(fn, charset)
  470. gem_require 'wmainfo'
  471. # hack hack hacky workaround
  472. m = WmaInfo.allocate
  473. m.instance_variable_set("@ext_info", {})
  474. m.__send__(:initialize, fn)
  475. t = m.tags
  476. si = m.info
  477. md = {
  478. 'Audio.Codec' => 'Windows Media Audio',
  479. 'Audio.Title' => enc_utf8(t['Title'], charset),
  480. 'Audio.Artist' => enc_utf8(t['Author'], charset),
  481. 'Audio.Album' => enc_utf8(t['AlbumTitle'], charset),
  482. 'Audio.AlbumArtist' => enc_utf8(t['AlbumArtist'], charset),
  483. 'Audio.Bitrate' => si["bitrate"],
  484. 'Audio.Duration' => si["playtime_seconds"],
  485. 'Audio.Genre' => parse_genre(enc_utf8(t['Genre'], charset)),
  486. 'Audio.ReleaseDate' => parse_time(t['Year']),
  487. 'Audio.TrackNo' => parse_num(t['TrackNumber'], :i),
  488. 'Audio.Copyright' => enc_utf8(t['Copyright'], charset),
  489. 'Audio.VariableBitrate' => (si['IsVBR'] == 1)
  490. }
  491. end
  492. #TODO: Generate audio waveform
  493. def audio_x_ape(fn, charset)
  494. gem_require 'apetag'
  495. m = ApeTag.new(fn)
  496. t = m.fields
  497. ad = (id3lib_extract(fn, charset) rescue {})
  498. fields = %w(Title Artist Album Comment Genre Subtitle Publisher Conductor
  499. Composer Copyright Publicationright File EAN/UPC ISBN Catalog
  500. LC Media Index Related ISRC Abstract Language Bibliography
  501. Introplay Dummy) + ['Debut Album', 'Record Date', 'Record Location']
  502. md = {
  503. 'Audio.ReleaseDate' => parse_time(t['Year']),
  504. 'Audio.TrackNo' => parse_num(t['Track'], :i)
  505. }
  506. fields.each{|k| md["Audio.#{k.gsub(" ", "")}"] = t[k] }
  507. ad.delete_if{|k,v| v.nil? }
  508. md['Audio.Genre'] = parse_genre(md['Audio.Genre'])
  509. md.merge(ad)
  510. end
  511. alias_method :audio_x_musepack, :audio_x_ape
  512. alias_method :audio_x_wavepack, :audio_x_ape
  513. #TODO: Generate audio waveform
  514. def audio_mpeg(fn, charset)
  515. gem_require 'mp3info'
  516. h = audio(fn, charset)
  517. begin
  518. Mp3Info.open(fn){|mp3|
  519. h['Audio.Duration'] = mp3.length
  520. h['Audio.Bitrate'] = mp3.bitrate
  521. h['Audio.VariableBitrate'] = mp3.vbr
  522. }
  523. rescue => e
  524. end
  525. h
  526. end
  527. def application_pdf(filename, charset)
  528. h = pdfinfo_extract_info(filename)
  529. charset = nil
  530. secure_filename(filename){|tfn|
  531. charset = `pdftotext #{tfn} - | head -c 65536`.chardet
  532. h['words'] = `pdftotext #{tfn} - | wc -w 2>/dev/null`.strip.to_i
  533. }
  534. if h['keywords']
  535. keywords = h['keywords'].split(/[,.]/).map{|s| enc_utf8(s.strip, charset) }.find_all{|s| not s.empty? }
  536. end
  537. md = {
  538. 'Doc.Title', enc_utf8(h['title'], charset),
  539. 'Doc.Author', enc_utf8(h['author'], charset),
  540. 'Doc.Created', parse_time(h['creationdate']),
  541. 'Doc.Subject', enc_utf8(h['subject'], charset),
  542. 'Doc.Modified', parse_time(h['moddate']),
  543. 'Doc.PageCount', h['pages'],
  544. 'Doc.Keywords', keywords,
  545. 'Doc.PageSizeName', h['page_size'],
  546. 'Doc.WordCount', h['words'],
  547. 'Doc.Charset', charset,
  548. 'Image.Width', parse_num(h['width'], :f),
  549. 'Image.Height', parse_num(h['height'], :f),
  550. 'Image.DimensionUnit', 'mm'
  551. }
  552. md.delete_if{|k,v| v.nil? }
  553. md
  554. end
  555. def application_postscript(filename, charset)
  556. extract_extract_info(filename)
  557. end
  558. alias_method :application_x_gzpostscript, :application_postscript
  559. def text_html(filename, charset)
  560. gem_require 'hpricot'
  561. words = secure_filename(filename){|tfn|
  562. `lynx -dump -display_charset=UTF-8 -nolist #{tfn} | wc -w 2>/dev/null`
  563. }.strip.to_i
  564. html = (File.read(filename, 65536) || "")
  565. charset = html.chardet
  566. h = {
  567. 'Doc.WordCount' => words,
  568. 'Doc.Charset' => charset
  569. }
  570. begin
  571. page = Hpricot.parse(html)
  572. te = (page / 'title')[0]
  573. if te
  574. title = enc_utf8(te.inner_text, charset)
  575. h['Doc.Title'] = title
  576. end
  577. tagstr = __get_meta(page, 'keywords', charset)
  578. if tagstr
  579. h['Doc.Keywords'] = tagstr.split(/\s*,\s*/)
  580. end
  581. h['Doc.Description'] = __get_meta(page, 'description', charset)
  582. h['Doc.Author'] = (__get_meta(page, 'author', charset) ||
  583. __get_meta(page, 'dc.author', charset))
  584. h['Doc.Publisher'] = (__get_meta(page, 'publisher', charset) ||
  585. __get_meta(page, 'dc.publisher', charset))
  586. h['Doc.Subject'] = (__get_meta(page, 'subject', charset) ||
  587. __get_meta(page, 'dc.subject', charset))
  588. geopos = __get_meta(page, 'geo.position', charset)
  589. icbm = __get_meta(page, 'icbm', charset)
  590. if geopos
  591. latlon = geopos.strip.split(/\s*;\s*/).map{|n| n.to_f }
  592. elsif icbm
  593. latlon = icbm.strip.split(/\s*,\s*/).map{|n| n.to_f }
  594. end
  595. if latlon and latlon.size == 2
  596. h['Location.Latitude'] = latlon[0]
  597. h['Location.Longitude'] = latlon[1]
  598. end
  599. rescue
  600. end
  601. h
  602. end
  603. def __get_meta(page, name, charset=nil)
  604. tag = (page / 'meta').find{|e|
  605. e['name'].to_s.downcase == name.downcase }
  606. return enc_utf8(tag['content'].to_s, charset) if tag
  607. nil
  608. end
  609. def text(filename, charset)
  610. words = secure_filename(filename){|tfn| `wc -w #{tfn} 2>/dev/null` }.strip.to_i
  611. charset = (File.read(filename, 65536) || "").chardet
  612. {
  613. 'Doc.WordCount' => words,
  614. 'Doc.Charset' => charset
  615. }
  616. end
  617. #TODO: Generate audio waveform
  618. def audio(filename, charset)
  619. id3 = (id3lib_extract(filename, charset) rescue {})
  620. h = mplayer_extract_info(filename)
  621. info = {
  622. 'Audio.Duration', (h['length'].to_i > 0) ? parse_num(h['length'], :f) : nil,
  623. 'Audio.Bitrate', h['audio_bitrate'] && h['audio_bitrate'] != '0' ?
  624. parse_num(h['audio_bitrate'], :i) / 1000.0 : nil,
  625. 'Audio.Codec', enc_utf8(h['audio_format'], charset),
  626. 'Audio.Samplerate', parse_num(h['audio_rate'], :i),
  627. 'Audio.Channels', parse_num(h['audio_nch'], :i),
  628. 'Audio.Title', enc_utf8(h['title'] || h['name'], charset),
  629. 'Audio.Artist', enc_utf8(h['artist'] || h['author'], charset),
  630. 'Audio.Album', enc_utf8(h['album'], charset),
  631. 'Audio.ReleaseDate', parse_time(h['date'] || h['creation date'] || h['year']),
  632. 'Audio.Comment', enc_utf8(h['comment'] || h['comments'], charset),
  633. 'Audio.TrackNo', parse_num(h['track'], :i),
  634. 'Audio.Copyright', enc_utf8(h['copyright'], charset),
  635. 'Audio.Software', enc_utf8(h['software'], charset),
  636. 'Audio.Genre', parse_genre(enc_utf8(h['genre'], charset))
  637. }
  638. id3.delete_if{|k,v| v.nil? }
  639. info.merge(id3)
  640. end
  641. #TODO: Generate audio waveform
  642. #TODO: Generate few frames for preview
  643. def video(filename, charset)
  644. id3 = (id3lib_extract(filename, charset) rescue {})
  645. h = mplayer_extract_info(filename)
  646. info = {
  647. 'Image.Width', parse_num(h['video_width'], :f),
  648. 'Image.Height', parse_num(h['video_height'], :f),
  649. 'Image.DimensionUnit', 'px',
  650. 'Video.Duration', (h['length'].to_i > 0) ? parse_num(h['length'], :f) : nil,
  651. 'Video.Framerate', parse_num(h['video_fps'], :f),
  652. 'Video.Bitrate', h['video_bitrate'] && h['video_bitrate'] != '0' ?
  653. parse_num(h['video_bitrate'], :i) / 1000.0 : nil,
  654. 'Video.Codec', enc_utf8(h['video_format'], charset),
  655. 'Audio.Bitrate', h['audio_bitrate'] && h['audio_bitrate'] != '0' ?
  656. parse_num(h['audio_bitrate'], :i) / 1000.0 : nil,
  657. 'Audio.Codec', enc_utf8(h['audio_format'], charset),
  658. 'Audio.Samplerate', parse_num(h['audio_rate'], :i),
  659. 'Audio.Channels', parse_num(h['audio_nch'], :i),
  660. 'Video.Title', enc_utf8(h['title'] || h['name'], charset),
  661. 'Video.Artist', enc_utf8(h['artist'] || h['author'], charset),
  662. 'Video.Album', enc_utf8(h['album'], charset),
  663. 'Video.ReleaseDate', parse_time(h['date'] || h['creation date'] || h['year']),
  664. 'Video.Comment', enc_utf8(h['comment'] || h['comments'], charset),
  665. 'Video.TrackNo', parse_num(h['track'], :i),
  666. 'Video.Genre', parse_genre(enc_utf8(h['genre'], charset)),
  667. 'Video.Copyright', enc_utf8(h['copyright'], charset),
  668. 'Video.Software', enc_utf8(h['software'], charset),
  669. 'Video.Demuxer', enc_utf8(h['demuxer'], charset)
  670. }
  671. case h['demuxer']
  672. when 'avi'
  673. info['File.Format'] = 'video/x-msvideo'
  674. when 'mkv'
  675. info['File.Format'] = 'video/x-matroska'
  676. when 'mov'
  677. info['File.Format'] = 'video/quicktime'
  678. end
  679. id3.delete_if{|k,v| v.nil? }
  680. info.merge(id3)
  681. end
  682. alias_method('application_x_flash_video', 'video')
  683. def video_x_ms_wmv(filename, charset)
  684. h = video(filename, charset)
  685. wma = audio_x_ms_wma(filename, charset)
  686. %w(
  687. Bitrate Artist Title Album Genre ReleaseDate TrackNo VariableBitrate
  688. ).each{|t|
  689. h['Video.'+t] = wma['Audio.'+t]
  690. }
  691. %w(Samplerate Codec).each{|t|
  692. h['Audio.'+t] = wma['Audio.'+t]
  693. }
  694. h
  695. end
  696. alias_method('video_x_ms_asf', 'video_x_ms_wmv')
  697. #TODO: Generate image histogram
  698. def image(filename, charset)
  699. begin
  700. gem_require 'imlib2'
  701. img = Imlib2::Image.load(filename.to_s)
  702. w = img.width
  703. h = img.height
  704. id_out = ""
  705. img.delete!
  706. rescue Exception
  707. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  708. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  709. end
  710. exif = (extract_exif(filename, charset) rescue {})
  711. info = {
  712. 'Image.Width' => parse_num(w, :f),
  713. 'Image.Height' => parse_num(h, :f),
  714. 'Image.DimensionUnit' => 'px',
  715. 'Image.LayerCount' => [id_out.split("\n").size, 1].max
  716. }.merge(exif)
  717. info
  718. end
  719. def image_svg_xml(filename, charset)
  720. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  721. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  722. info = {
  723. 'Image.Width' => parse_num(w, :f),
  724. 'Image.Height' => parse_num(h, :f),
  725. 'Image.DimensionUnit' => 'px'
  726. }
  727. info
  728. end
  729. def image_gif(filename, charset)
  730. id_out = secure_filename(filename){|tfn| `identify #{tfn}` }
  731. w,h = id_out.scan(/[0-9]+x[0-9]+/)[0].split("x",2)
  732. exif = (extract_exif(filename, charset) rescue {})
  733. info = {
  734. 'Image.Width' => parse_num(w, :f),
  735. 'Image.Height' => parse_num(h, :f),
  736. 'Image.DimensionUnit' => 'px',
  737. 'Image.FrameCount' => [id_out.split("\n").size, 1].max
  738. }.merge(exif)
  739. info
  740. end
  741. def image_x_dcraw(filename, charset)
  742. exif = (extract_exif(filename, charset) rescue {})
  743. dcraw = extract_dcraw(filename)
  744. info = {
  745. 'Image.Frames' => 1,
  746. 'Image.DimensionUnit' => 'px'
  747. }.merge(exif).merge(dcraw)
  748. info
  749. end
  750. def application_x_bittorrent(fn, charset)
  751. require 'metadata/bt'
  752. h = File.read(fn).bdecode
  753. charset ||= h['encoding']
  754. i = h['info']
  755. name = i['name.utf-8'] || enc_utf8(i['name'], charset)
  756. {
  757. 'Doc.Title' => name,
  758. 'BitTorrent.Name' => name,
  759. 'BitTorrent.Files' =>
  760. if i['files']
  761. i['files'].map{|f|
  762. up = f['path.utf-8']
  763. up = up.join("/") if up.is_a?(Array)
  764. pt = f['path']
  765. pt = pt.join("/") if pt.is_a?(Array)
  766. fh = {"path" => (up || enc_utf8(pt, charset)),
  767. "length" => f['length']
  768. }
  769. fh['md5sum'] = f['md5sum'] if f['md5sum']
  770. fh
  771. }
  772. else
  773. nil
  774. end,
  775. 'BitTorrent.Length' => i['length'],
  776. 'BitTorrent.MD5Sum' => i['md5sum'],
  777. 'BitTorrent.PieceLength' => i['piece length'],
  778. 'BitTorrent.PieceCount' => i['pieces'].size / 20,
  779. 'File.Software' => enc_utf8(h['created by'], charset),
  780. 'Doc.Created' => parse_time(Time.at(h['creation date']).iso8601),
  781. 'BitTorrent.Comment' => enc_utf8(h['comment'], charset),
  782. 'BitTorrent.Announce' => enc_utf8(h['announce'], charset),
  783. 'BitTorrent.AnnounceList' => h['announce-list'],
  784. 'BitTorrent.Nodes' => h['nodes']
  785. }
  786. end
  787. def text__gettext(filename, charset, layout=false)
  788. enc_utf8((File.read(filename) || ""), charset)
  789. end
  790. def text_html__gettext(filename, charset, layout=false)
  791. enc_utf8(secure_filename(filename){|tfn| `lynx -dump -display_charset=UTF-8 -nolist #{tfn}` }, charset)
  792. end
  793. def application_pdf__gettext(filename, charset, layout=false)
  794. page = 0
  795. str = secure_filename(filename){|tfn| `pdftotext #{layout ? "-layout " : ""}-enc UTF-8 #{tfn} -` }
  796. if layout
  797. str.gsub!(/\f/u, "\f\n")
  798. str.gsub!(/^/u, " ")
  799. str.gsub!(/\A| ?\f/u) {|pg|
  800. "\nPage #{page+=1}.\n"
  801. }
  802. str.sub!(/\n+/, "")
  803. str.sub!(/1\./, "1.\n")
  804. end
  805. enc_utf8(str, "UTF-8")
  806. end
  807. def application_postscript__gettext(filename, charset, layout=false)
  808. page = 0
  809. str = secure_filename(filename){|tfn| `pstotext #{tfn}` }
  810. if layout
  811. str.gsub!(/\f/u, "\f\n")
  812. str.gsub!(/^/u, " ")
  813. str.gsub!(/\A| ?\f/u) {|pg|
  814. "\nPage #{page+=1}.\n"
  815. }
  816. str.sub!(/\n+/, "")
  817. str.sub!(/1\./, "1.\n")
  818. end
  819. enc_utf8(str, "ISO-8859-1") # pstotext outputs iso-8859-1
  820. end
  821. def application_x_gzpostscript__gettext(filename, charset, layout=false)
  822. page = 0
  823. str = secure_filename(filename){|tfn| `zcat #{tfn} | pstotext -` }
  824. if layout
  825. str.gsub!(/\f/u, "\f\n")
  826. str.gsub!(/^/u, " ")
  827. str.gsub!(/\A| ?\f/u) {|pg|
  828. "\nPage #{page+=1}.\n"
  829. }
  830. str.sub!(/\n+/, "")
  831. str.sub!(/1\./, "1.\n")
  832. end
  833. enc_utf8(str, "ISO-8859-1") # pstotext outputs iso-8859-1
  834. end
  835. def application_msword__gettext(filename, charset, layout=false)
  836. secure_filename(filename){|sfn| enc_utf8(`antiword #{sfn}`, charset) }
  837. end
  838. def application_rtf__gettext(filename, charset, layout=false)
  839. secure_filename(filename){|sfn| enc_utf8(`catdoc -d UTF-8 #{sfn}`, charset) }
  840. end
  841. def application_vnd_ms_powerpoint__gettext(filename, charset, layout=false)
  842. secure_filename(filename){|sfn| enc_utf8(`catppt -d UTF-8 #{sfn}`, charset) }
  843. end
  844. def application_vnd_ms_excel__gettext(filename, charset, layout=false)
  845. secure_filename(filename){|sfn| enc_utf8(`xls2csv -d UTF-8 #{sfn}`, charset) }
  846. end
  847. open_office_types = %w(
  848. application/vnd.oasis.opendocument.text
  849. application/vnd.oasis.opendocument.text-template
  850. application/vnd.oasis.opendocument.text-web
  851. application/vnd.oasis.opendocument.text-master
  852. application/vnd.oasis.opendocument.graphics
  853. application/vnd.oasis.opendocument.graphics-template
  854. application/vnd.oasis.opendocument.presentation
  855. application/vnd.oasis.opendocument.presentation-template
  856. application/vnd.oasis.opendocument.spreadsheet
  857. application/vnd.oasis.opendocument.spreadsheet-template
  858. application/vnd.oasis.opendocument.presentation
  859. application/vnd.oasis.opendocument.chart
  860. application/vnd.oasis.opendocument.formula
  861. application/vnd.oasis.opendocument.database
  862. application/vnd.sun.xml.writer
  863. application/vnd.sun.xml.writer.template
  864. application/vnd.sun.xml.calc
  865. application/vnd.sun.xml.calc.template
  866. application/vnd.sun.xml.impress
  867. application/vnd.sun.xml.impress.template
  868. application/vnd.sun.xml.writer.global
  869. application/vnd.sun.xml.math
  870. application/vnd.stardivision.writer
  871. application/vnd.stardivision.writer-global
  872. application/vnd.stardivision.calc
  873. application/vnd.stardivision.impress
  874. application/vnd.stardivision.impress-packed
  875. application/vnd.stardivision.math
  876. application/vnd.stardivision.chart
  877. application/vnd.stardivision.mail
  878. application/x-starwriter
  879. application/x-starcalc
  880. application/x-stardraw
  881. application/x-starimpress
  882. application/x-starmath
  883. application/x-starchart)
  884. office_types = %w(
  885. application/msword
  886. application/rtf
  887. application/vnd.openxmlformats-officedocument.presentationml.presentation
  888. application/vnd.openxmlformats-officedocument.wordprocessingml.document
  889. application/vnd.ms-word.document.macroenabled.12
  890. application/vnd.openxmlformats-officedocument.wordprocessingml.template
  891. application/vnd.ms-word.template.macroenabled.12
  892. application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
  893. application/vnd.ms-excel.sheet.macroenabled.12
  894. application/vnd.openxmlformats-officedocument.spreadsheetml.template
  895. application/vnd.ms-excel.template.macroenabled.12
  896. application/vnd.openxmlformats-officedocument.presentationml.presentation
  897. application/vnd.ms-powerpoint.presentation.macroenabled.12
  898. application/vnd.openxmlformats-officedocument.presentationml.template
  899. application/vnd.ms-powerpoint.template.macroenabled.12
  900. application/vnd.ms-excel.sheet.binary.macroenabled.12
  901. application/vnd.ms-word
  902. application/vnd.ms-excel
  903. application/vnd.ms-powerpoint
  904. )
  905. def self.create_text_extractor(mimetype, &block)
  906. major,minor = mimetype.to_s.gsub(/[^\/a-z0-9]/i,"_").split("/")
  907. mn = [major,minor,"_gettext"].join("_")
  908. define_method(mn, &block)
  909. end
  910. def self.create_info_extractor(mimetype, &block)
  911. major,minor = mimetype.to_s.gsub(/[^\/a-z0-9]/i,"_").split("/")
  912. mn = [major,minor].join("_")
  913. define_method(mn, &block)
  914. end
  915. (open_office_types).each{|t|
  916. create_text_extractor(t) do |filename, charset, layout|
  917. nil
  918. end
  919. }
  920. (open_office_types + office_types).each{|t|
  921. create_info_extractor(t) do |filename, charset|
  922. extract_extract_info(filename)
  923. end
  924. }
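# Runs mplayer in identify-only mode (null output, zero frames) and turns its
# ID_* lines and CLIP INFO entries into a hash of lowercased keys.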
  925. def mplayer_extract_info(filename)
  926. mplayer = `which mplayer32 2>/dev/null`.strip
  927. mplayer = `which mplayer 2>/dev/null`.strip if mplayer.empty?
  928. mplayer = "mplayer" if mplayer.empty?
  929. output = IO.popen("#{mplayer.dump} -quiet -identify -vo null -ao null -frames 0 -playlist - 2>/dev/null", "r+"){|mp|
  930. mp.puts filename
  931. mp.close_write
  932. mp.read
  933. }
  934. ids = output.split("\n").grep(/^ID_/).map{|t|
  935. k,v, = t.split("=",2)
  936. k = k.downcase[3..-1]
  937. [k,v]
  938. }
  939. hash = Hash[*ids.flatten]
  940. hash.each{|k,v|
  941. if k =~ /^clip_info_name/
  942. hash[v.downcase] = hash[k.sub("name", "value")]
  943. end
  944. }
  945. f = {
  946. '85' => 'MP3',
  947. 'fLaC' => 'FLAC',
  948. 'vrbs' => 'Vorbis',
  949. 'hwac3' => 'AC3',
  950. '1' => 'PCM',
  951. '7' => 'Sun Audio',
  952. '353' => 'Windows Media Audio'
  953. }
  954. hash['audio_format'] = f[hash['audio_format']] if f[hash['audio_format']]
  955. hash
  956. end
  957. def extract_extract_info(filename)
  958. arr = secure_filename(filename){|tfn| `extract #{tfn}` }.strip.split("\n").map{|s| s.split(" - ",2) }
  959. h = arr.to_hash
  960. filenames = arr.find_all{|k,v| k == 'filename' }.map{|k,v| enc_utf8(v, nil) }
  961. keywords = arr.find_all{|k,v| k == 'keywords' }.map{|k,v| enc_utf8(v, nil) }
  962. revisions = arr.find_all{|k,v| k == 'revision history' }.map{|k,v| enc_utf8(v, nil) }
  963. md = {
  964. 'Doc.Title', enc_utf8(h['title'], nil),
  965. 'Doc.Subject', enc_utf8(h['subject'], nil),
  966. 'Doc.Author', enc_utf8(h['creator'], nil),
  967. 'Doc.LastSavedBy', enc_utf8(h['last saved by'], nil),
  968. 'Doc.Language', enc_utf8(h['language'], nil),
  969. 'Doc.Artist', enc_utf8(h['artist'], nil),
  970. 'Doc.Genre', enc_utf8(h['genre'], nil),
  971. 'Doc.Album', enc_utf8(h['album'], nil),
  972. 'Doc.Language', enc_utf8(h['language'], nil),
  973. 'Doc.Created', parse_time(h['creation date']),
  974. 'Doc.Modified', parse_time(h['modification date'] || h['date']),
  975. 'Doc.RevisionHistory', revisions.empty? ? nil : revisions,
  976. 'Doc.Description', enc_utf8(h['description'], nil),
  977. 'Doc.Keywords', keywords.empty? ? nil : keywords,
  978. 'File.Software', enc_utf8(h['software'] || h['generator'], nil),
  979. 'Doc.Template', enc_utf8(h['template'], nil),
  980. 'Archive.Contents', filenames.empty? ? nil : filenames,
  981. 'Doc.WordCount', parse_num(h['word count'], :i),
  982. 'Doc.PageCount', parse_num(h['page count'], :i),
  983. 'Doc.ParagraphCount', parse_num(h['paragraph count'], :i),
  984. 'Doc.LineCount', parse_num(h['line count'], :i),
  985. 'Doc.CharacterCount', parse_num(h['character count'], :i)
  986. }
  987. md.delete_if{|k,v| v.nil? }
  988. md
  989. end
  990. def base64 s
  991. return nil if s.nil? || s.empty?
  992. return Base64.encode64(s)
  993. end
  994. def id3lib_extract(fn, charset)
  995. gem_require 'id3lib'
  996. t = ID3Lib::Tag.new(fn)
  997. time = t.year
  998. if t.date
  999. time = "#{time}-#{t.date[2,2]}-#{t.date[0,2]}"
  1000. end
  1001. unless charset
  1002. ls = [t.title, t.artist, t.album, t.lyrics, t.comment].join
  1003. charset = ls.chardet if ls and not ls.empty?
  1004. charset = nil if charset =~ /ISO-8859|windows-1252/i
  1005. end
  1006. {
  1007. 'Audio.Title' => enc_utf8(t.title, charset),
  1008. 'Audio.Subtitle' => enc_utf8(t.subtitle, charset),
  1009. 'Audio.Artist' => enc_utf8(t.artist, charset),
  1010. 'Audio.Band' => enc_utf8(t.band, charset),
  1011. 'Audio.Composer' => enc_utf8(t.composer, charset),
  1012. 'Audio.Performer' => enc_utf8(t.performer, charset),
  1013. 'Audio.Conductor' => enc_utf8(t.conductor, charset),
  1014. 'Audio.Lyricist' => enc_utf8(t.lyricist, charset),
  1015. 'Audio.RemixedBy' => enc_utf8(t.remixed_by, charset),
  1016. 'Audio.InterpretedBy' => enc_utf8(t.interpreted_by, charset),
  1017. 'Audio.Genre' => parse_genre(enc_utf8(t.genre, charset)),
  1018. 'Audio.Grouping' => enc_utf8(t.grouping, charset),
  1019. 'Audio.Album' => enc_utf8(t.album, charset),
  1020. 'Audio.Publisher' => enc_utf8(t.publisher, charset),
  1021. 'Audio.ReleaseDate' => parse_time(time),
  1022. 'Audio.DiscNo' => parse_num(t.disc, :i),
  1023. 'Audio.TrackNo' => parse_num(t.track, :i),
  1024. 'Audio.Tempo' => parse_num(t.bpm, :i),
  1025. 'Audio.Comment' => enc_utf8(t.comment, charset),
  1026. 'Audio.Lyrics' => enc_utf8(t.lyrics, charset),
  1027. 'Audio.Image' => base64(t.find_all{|f| f[:id] == :APIC }.map{|f| f[:data] }[0])
  1028. }
  1029. end
  1030. def extract_exif_tag(exif, filename, *tags)
  1031. tag = tags.find{|t| exif[t] }
  1032. value = exif[tag]
  1033. if value and value =~ /\A\s*\(Binary data \d+ bytes, use -b option to extract\)\s*\Z/
  1034. value = secure_filename(filename){|tfn|
  1035. `exiftool -b -#{tag} #{tfn} 2>/dev/null`
  1036. }
  1037. end
  1038. value
  1039. end
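# Reads tags with exiftool (tab-separated, one per line) and maps them to
# Image.*, File.Software and Location.* fields; binary tags are fetched
# separately via extract_exif_tag.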
  1040. def extract_exif(filename, charset=nil)
  1041. exif = {}
  1042. raw_exif = secure_filename(filename){|tfn|
  1043. `exiftool -s -t -c "%.6f" -d "%Y:%m:%dT%H:%M:%S%Z" #{tfn} 2>/dev/null`
  1044. }.split("\n", 8).last
  1045. raw_exif.strip.split("\n").each do |t|
  1046. k,v = t.split("\t", 2)
  1047. exif[k] = v
  1048. end
  1049. ex = lambda{|tags| enc_utf8( extract_exif_tag(exif, filename, *tags), charset ) }
  1050. info = {
  1051. 'Image.Description' => ex[%w(ImageDescription Description Caption-Abstract Comment)],
  1052. 'Image.Creator' => ex[%w(Artist Creator By-line)],
  1053. 'Image.Editor' => ex[["Editor"]],
  1054. 'File.Software' => ex[["Software"]],
  1055. 'Image.OriginatingProgram' => ex[["OriginatingProgram"]],
  1056. 'Image.ExposureProgram' => ex[["ExposureProgram"]],
  1057. 'Image.Copyright' => ex[%w(Copyright CopyrightNotice CopyrightURL)],
  1058. 'Image.ISOSpeed' => parse_num(exif["ISO"], :f),
  1059. 'Image.Fnumber' => parse_num(exif["FNumber"], :f),
  1060. 'Image.Flash' => exif["FlashFired"] ?
  1061. enc_utf8(exif["FlashFired"], charset) == "True" : nil,
  1062. 'Image.FocalLength' => parse_num(exif["FocalLength"], :f),
  1063. 'Image.WhiteBalance' => ex[["WhiteBalance"]],
  1064. 'Image.CameraMake' => ex[['Make']],
  1065. 'Image.CameraModel' => ex[['Model']],
  1066. 'Image.Title' => ex[['Title']],
  1067. 'Image.ColorMode' => ex[['ColorMode']],
  1068. 'Image.ColorSpace' => ex[['ColorSpace']],
  1069. 'Image.EXIF' => enc_utf8(raw_exif, charset),
  1070. 'Location.Latitude' => parse_num(exif['GPSLatitude'], :f),
  1071. 'Location.Longitude' => parse_num(exif['GPSLongitude'], :f)
  1072. }
  1073. if exif["MeteringMode"]
  1074. info['Image.MeteringMode'] = enc_utf8(exif["MeteringMode"].split(/[^a-z]/i).map{|s|s.capitalize}.join, charset)
  1075. end
  1076. if t = exif["ModifyDate"]
  1077. info['Image.Date'] =
  1078. info['Image.Modified'] = parse_time(t.split(":",3).join("-"))
  1079. end
  1080. if t = exif["DateCreated"]
  1081. info['Image.Date'] =
  1082. info['Image.DateCreated'] = parse_time(t.split(":",3).join("-"))
  1083. end
  1084. if t = exif["DateTimeCreated"]
  1085. info['Image.Date'] =
  1086. info['Image.DateTimeCreated'] = parse_time(t.split(":",3).join("-"))
  1087. end
  1088. info['Image.Date'] = info['Image.Date'].dup if info['Image.Date']
  1089. if t = exif["DateTimeOriginal"]
  1090. info['Image.DateTimeOriginal'] = parse_time(t.split(":",3).join("-"))
  1091. end
  1092. if exif['ExposureTime']
  1093. d,n = exif['ExposureTime'].split("/")
  1094. n ||= 1.0
  1095. info['Image.ExposureTime'] = d.to_f / n.to_f
  1096. end
  1097. info
  1098. end
  1099. def extract_dcraw(filename)
  1100. hash = {}
  1101. secure_filename(filename){|tfn| `dcraw -i -v #{tfn}` }.strip.split("\n").
  1102. each do |t|
  1103. k,v = t.split(/:\s*/, 2)
  1104. hash[k] = v
  1105. end
  1106. w, h = hash["Output size"].split("x",2).map{|s| parse_num(s.strip, :f) }
  1107. t = hash
  1108. info = {
  1109. 'Image.Width', w,
  1110. 'Image.Height', h,
  1111. 'Image.FilterPattern', t['Filter pattern'],
  1112. 'Image.FocalLength', parse_num(t['Focal length'], :f),
  1113. 'Image.ISOSpeed', parse_num(t['ISO speed'], :f),
  1114. 'Image.CameraModel', enc_utf8(t['Camera'], nil),
  1115. 'Image.ComponentCount', parse_num(t['Raw colors'], :i),
  1116. 'Image.Fnumber', parse_num(t['Aperture'], :f)
  1117. }
  1118. if t['Shutter']
  1119. d,n = t['Shutter'].split("/")
  1120. n ||= 1.0
  1121. info['Image.ExposureTime'] = d.to_f / n.to_f
  1122. end
  1123. info
  1124. end
  1125. def pdfinfo_extract_info(filename)
  1126. ids = secure_filename(filename){|tfn| `pdfinfo #{tfn}` }.strip.split("\n").
  1127. map{|r|
  1128. k,v = r.split(":",2)
  1129. k = k.downcase
  1130. v = parse_val(v.strip)
  1131. [k,v]
  1132. }
  1133. i = Hash[*ids.flatten]
  1134. if i['page size']
  1135. w,h = i['page size'].gsub(/[^0-9.]/, ' ').strip.split(/\s+/,2)
  1136. wmm = w.to_f.points_to_mm
  1137. hmm = h.to_f.points_to_mm
  1138. i['page_size'] = i['page size'].scan(/\(([^)]+)\)/).flatten[0]
  1139. i['width'] = wmm
  1140. i['height'] = hmm
  1141. i['dimensions_unit'] = 'mm'
  1142. end
  1143. i
  1144. end
  1145. def citeseer_extract(title)
  1146. require 'metadata/citeseer'
  1147. h = CiteSeer.get_info(title)
  1148. return h if h.empty?
  1149. m = {}
  1150. m['Doc.Title'] = h['title']
  1151. m['Doc.Author'] = (h['creator'] || h['author'])
  1152. m['Doc.Description'] = h['description']
  1153. m['Doc.Publisher'] = h['publisher']
  1154. m['Doc.Contributor'] = h['contributor']
  1155. m['Doc.Subject'] = h['subject']
  1156. m['Doc.Source'] = h['source'] || h['ee']
  1157. m['Doc.CiteSeerURL'] = h['identifier']
  1158. m['Doc.Language'] = h['language']
  1159. m['Doc.Publication'] = h['book'] || h['booktitle'] || h['journal']
  1160. m['Doc.PublicationPages'] = h['pages']
  1161. m['Doc.Citations'] = h['citations']
  1162. m['Doc.Published'] = parse_time(h['date'] || h['year'])
  1163. m['Doc.CiteSeerIdentifier'] = h['bibtex_id']
  1164. m.delete_if{|k,v| !v }
  1165. m
  1166. end
  1167. def dblp_extract(title)
  1168. require 'metadata/dblp'
  1169. h = DBLP.get_info(title)
  1170. return h if h.empty?
  1171. m = {}
  1172. m['Doc.Title'] = h['title']
  1173. m['Doc.Author'] = h['author']
  1174. m['Doc.Description'] = h['description']
  1175. m['Doc.Publisher'] = h['publisher']
  1176. m['Doc.Contributor'] = h['contributor']
  1177. m['Doc.Subject'] = h['subject']
  1178. m['Doc.Source'] = h['ee']
  1179. m['Doc.CrossRef'] = h['crossref']
  1180. m['Doc.BibSource'] = h['bibsource']
  1181. m['Doc.Language'] = h['language']
  1182. m['Doc.Publication'] = h['book'] || h['booktitle'] || h['journal']
  1183. m['Doc.PublicationPages'] = h['pages']
  1184. m['Doc.Published'] = parse_time(h['date'] || h['year'])
  1185. m['Doc.BibTexType'] = h['bibtex_type']
  1186. m['Doc.DBLPIdentifier'] = h['bibtex_id']
  1187. m.delete_if{|k,v| !v }
  1188. m
  1189. end
  1190. # Create a link to `filename' with a secure filename and yield it.
  1191. # Unlinks secure filename after yield returns.
  1192. #
  1193. # This is needed because of filenames like "-h".
  1194. #
  1195. # If the filename doesn't begin with a dash, it is passed in
  1196. # double quotes, with double quotes and dollar signs in the
  1197. # filename escaped.
  1198. #
  1199. def secure_filename(filename)
  1200. require 'fileutils'
  1201. if filename =~ /^-/
  1202. dirname = File.dirname(File.expand_path(filename))
  1203. tfn = "/tmp/" + temp_filename + (File.extname(filename) || "").
  1204. gsub(/[^a-z0-9_.]/i, '_') # PAA RAA NOO IAA
  1205. begin
  1206. FileUtils.ln(filename, tfn)
  1207. rescue
  1208. FileUtils.cp(filename, tfn) # different fs for /tmp
  1209. end
  1210. yield(tfn)
  1211. else # trust the filename to not blow up in our face
  1212. yield(%Q("#{filename.gsub(/[$"]/, "\\\\\\0")}"))
  1213. end
  1214. ensure
  1215. File.unlink(tfn) if tfn and File.exist?(tfn)
  1216. end
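# Usage sketch ("-odd name.pdf" is a hypothetical path; this mirrors how the
# extractors above call it):
#
#   secure_filename("-odd name.pdf"){|sfn| `pdfinfo #{sfn}` }
#   # yields a hard link (or copy) under /tmp for dash-prefixed names,
#   # otherwise the original path shell-quoted, and unlinks the temp file
#   # after the block returns.
#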
  1217. def temp_filename
  1218. "metadata_temp_#{Process.pid}_#{Thread.current.object_id}_#{Time.now.to_f}"
  1219. end
  1220. def parse_val(v)
  1221. case v
  1222. when /^[0-9]+$/: v.to_i
  1223. when /^[0-9]+(\.[0-9]+)?$/: v.to_f
  1224. else
  1225. v
  1226. end
  1227. end
  1228. def enc_utf8(s, charset)
  1229. return nil if s.nil? or s.empty?
  1230. s.to_utf8(charset)
  1231. end
  1232. def parse_num(s, cast=nil)
  1233. if s.is_a? Numeric
  1234. return (
  1235. case cast
  1236. when :f
  1237. s.to_f
  1238. when :i
  1239. s.to_i
  1240. else
  1241. s
  1242. end
  1243. )
  1244. end
  1245. return nil if s.nil? or s.empty? or not s.scan(/[0-9]+/)[0]
  1246. case cast
  1247. when :i
  1248. num = nil
  1249. s.sub(/[0-9]+/){|h| num = h }
  1250. if num
  1251. num.to_i
  1252. else
  1253. nil
  1254. end
  1255. when :f
  1256. num = nil
  1257. s.sub(/[0-9]+(\.[0-9]+(e[-+]?[0-9]+)?)?/i){|h| num = h }
  1258. if num
  1259. num.to_f
  1260. else
  1261. nil
  1262. end
  1263. else
  1264. s.scan(/[0-9]+/)[0]
  1265. end
  1266. end
  1267. def parse_time(s)
  1268. return s if s.is_a?(DateTime)
  1269. return nil if s.nil? or s.empty?
  1270. DateTime.parse(s.to_s)
  1271. rescue
  1272. t = s.to_s.scan(/\d{4}/)[0]
  1273. if t.nil?
  1274. t = s.to_s.scan(/\d{2}/)[0]
  1275. unless t.nil?
  1276. y = Time.now.year.to_s
  1277. t = "#{t.to_i > y[-2,2].to_i ? y[0,2].to_i-1 : y[0,2]}#{t}-01-01"
  1278. DateTime.parse(t)
  1279. else
  1280. nil
  1281. end
  1282. else
  1283. t += "-01-01"
  1284. DateTime.parse(t)
  1285. end
  1286. end
  1287. def parse_genre(s)
  1288. gem_require 'id3lib'
  1289. return nil if s.nil? or s.empty?
  1290. return s unless s =~ /^\(\d+\)/
  1291. genre_num = s.scan(/\d+/).first.to_i
  1292. ID3Lib::Info::Genres[genre_num] || s
  1293. end
  1294. def remove_ligatures(s)
  1295. return s unless s.is_a?(String)
  1296. s.gsub("æ", 'ae').
  1297. gsub("ä", 'ae').
  1298. gsub("ö", 'oe').
  1299. gsub("å", 'o').
  1300. gsub("Æ", 'AE').
  1301. gsub("œ", "ce").
  1302. gsub("Œ", "CE").
  1303. gsub("ŋ", "ng").
  1304. gsub("Ŋ", "NG").
  1305. gsub("ʩ", "fng").
  1306. gsub("ﬀ", "ff").
  1307. gsub("ﬁ", "fi").
  1308. gsub("ﬂ", "fl").
  1309. gsub("ﬃ", "ffi").
  1310. gsub("ﬄ", "ffl").
  1311. gsub("ſt", "ft").
  1312. gsub("ﬆ", "st").
  1313. gsub("ß", "ss")
  1314. end
  1315. end