PageRenderTime 44ms CodeModel.GetById 16ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/include/assignment_set.rb

http://crowdmos.codeplex.com
Ruby | 604 lines | 482 code | 96 blank | 26 comment | 56 complexity | 87df580ec3d0bfcb2ef323e2478124c2 MD5 | raw file
Possible License(s): Apache-2.0
  1. #!/usr/bin/env ruby
  2. # Copyright (c) 2010-2012 Microsoft Corp.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. require 'rubygems'
  16. require 'ruby-aws'
  17. require 'matrix'
  18. require './include/array'
  19. module MOS
  20. class AssignmentSet
  21. attr_reader :stats, :assignment_list
  # Builds the set around +assignment_list+ and derives all statistics
  # right away.
  #
  # assignment_list - collection of assignment objects. It is stored by
  #                   reference (no copy is made) and is what the
  #                   #assignment_list reader returns.
  def initialize(assignment_list)
    @assignment_list = assignment_list
    rebuild_stats
  end
  # Recomputes every statistic from the current @assignment_list.
  #
  # The order is significant: init_stats resets the key lists and both stats
  # trees, the two build_stats passes then fill the per-sentence tree
  # (algorithm -> sentence -> worker) and the per-worker tree
  # (algorithm -> worker -> sentence), and compute_algorithm_ci95 finally
  # reads both trees.
  def rebuild_stats
    init_stats
    passes = [
      [@stats_sentences, @sentences, @workers],
      [@stats_workers, @workers, @sentences]
    ]
    passes.each do |tree, middle_keys, leaf_keys|
      build_stats(tree, @algorithms, middle_keys, leaf_keys)
    end
    compute_algorithm_ci95
  end
  # Two-sided 95% critical values of Student's t distribution. The entry at
  # index i belongs to i + 1 degrees of freedom (the first entry is 1 dof).
  # Determined with MATLAB using:
  #   >> n = (1:100)';
  #   >> t = tinv(.5*(1 + .95), n);
  T95_TABLE = [
    12.7062, 4.3027, 3.1824, 2.7764, 2.5706, 2.4469, 2.3646, 2.3060, 2.2622, 2.2281,
    2.2010, 2.1788, 2.1604, 2.1448, 2.1314, 2.1199, 2.1098, 2.1009, 2.0930, 2.0860,
    2.0796, 2.0739, 2.0687, 2.0639, 2.0595, 2.0555, 2.0518, 2.0484, 2.0452, 2.0423,
    2.0395, 2.0369, 2.0345, 2.0322, 2.0301, 2.0281, 2.0262, 2.0244, 2.0227, 2.0211,
    2.0195, 2.0181, 2.0167, 2.0154, 2.0141, 2.0129, 2.0117, 2.0106, 2.0096, 2.0086,
    2.0076, 2.0066, 2.0057, 2.0049, 2.0040, 2.0032, 2.0025, 2.0017, 2.0010, 2.0003,
    1.9996, 1.9990, 1.9983, 1.9977, 1.9971, 1.9966, 1.9960, 1.9955, 1.9949, 1.9944,
    1.9939, 1.9935, 1.9930, 1.9925, 1.9921, 1.9917, 1.9913, 1.9908, 1.9905, 1.9901,
    1.9897, 1.9893, 1.9890, 1.9886, 1.9883, 1.9879, 1.9876, 1.9873, 1.9870, 1.9867,
    1.9864, 1.9861, 1.9858, 1.9855, 1.9853, 1.9850, 1.9847, 1.9845, 1.9842, 1.9840
  ].freeze

  # Returns the two-sided 95% t quantile for +n+ degrees of freedom.
  #
  # n - degrees of freedom. Values below 1 are treated as 1 (the widest,
  #     most conservative entry) so that a negative n can never index the
  #     table from its end.
  #
  # Beyond the table the Gaussian approximation 1.96 (n -> inf) is returned.
  def t95(n)
    return 1.96 if n > T95_TABLE.length
    # The table is 0-indexed while degrees of freedom start at 1.
    T95_TABLE[[n, 1].max - 1]
  end
  # Summarises node[:values] into node[:stats] (the stats hash is created
  # when missing and updated in place otherwise).
  #
  # Always written: :count, :mean, :min, :max, :var, :std_dev, :kurtosis.
  # :ci (t95(count - 1) * sqrt(var / count)) is written only when there is
  # more than one value; otherwise an existing :ci entry is left untouched.
  #
  # Returns the :ci value when it was computed, nil otherwise.
  def stats_from_values(node)
    samples = node[:values]
    node[:stats] ||= {}
    summary = node[:stats]

    summary[:count] = samples.length
    [:mean, :min, :max, :var, :std_dev, :kurtosis].each do |measure|
      summary[measure] = samples.send(measure)
    end

    return unless summary[:count] > 1
    quantile = t95(summary[:count] - 1)
    summary[:ci] = quantile * Math.sqrt(summary[:var] / summary[:count])
  end
  # Computes one 95% confidence half-width per algorithm and stores it as
  # [:stats][:ci] on the algorithm node of both @stats_sentences and
  # @stats_workers.
  #
  # Inputs gathered per algorithm:
  #   v_wu  - mean of the per-sentence variances (sentences with >= 2 entries)
  #   v_su  - mean of the per-worker variances (workers with >= 2 entries)
  #   v_swu - variance of all per-(sentence, worker) cell means
  #   cells - number of (sentence, worker) cells that hold data
  #
  # A small linear system splits these into components (v_s, v_w, v_u --
  # presumably sentence, worker and residual variance; confirm against the
  # method's reference), each clamped at 0. The components are combined into
  # v_mu, the variance used for the interval. When v_su and/or v_wu is
  # unavailable (nil) a reduced system is used.
  def compute_algorithm_ci95
    @algorithms.each do |algorithm|
      by_sentence = @stats_sentences[algorithm]
      next unless by_sentence
      by_worker = @stats_workers[algorithm]

      # Per-sentence variances and counts, only where a variance is defined.
      sentence_vars = []
      sentence_counts = []
      @sentences.each do |sentence|
        node = by_sentence[sentence]
        next unless node && node[:stats][:count] >= 2
        sentence_vars << node[:stats][:var]
        sentence_counts << node[:stats][:count]
      end

      # Per-worker variances and counts, same rule.
      worker_vars = []
      worker_counts = []
      @workers.each do |worker|
        node = by_worker[worker]
        next unless node && node[:stats][:count] >= 2
        worker_vars << node[:stats][:var]
        worker_counts << node[:stats][:count]
      end

      # Means of every populated (sentence, worker) cell.
      cell_means = []
      @sentences.each do |sentence|
        node = by_sentence[sentence]
        next unless node
        @workers.each do |worker|
          cell = node[worker]
          cell_means << cell[:stats][:mean] if cell
        end
      end
      cells = cell_means.length

      v_wu = sentence_vars.mean
      v_su = worker_vars.mean
      v_swu = cell_means.var
      mi2 = sentence_counts.map { |count| count ** 2 }.sum
      nj2 = worker_counts.map { |count| count ** 2 }.sum

      v_mu =
        if v_su && v_wu
          # Full system: v_su = v_s + v_u, v_wu = v_w + v_u, v_swu = v_s + v_w + v_u.
          m = Matrix[[ 1.0, 0.0, 1.0 ], [ 0.0, 1.0, 1.0 ], [ 1.0, 1.0, 1.0 ]]
          v = m.inverse * Matrix[[ v_su ], [ v_wu ], [ v_swu ]]
          v_s = [ v[0,0], 0.0 ].max
          v_w = [ v[1,0], 0.0 ].max
          v_u = [ v[2,0], 0.0 ].max
          v_s * mi2/(cells ** 2) + v_w * nj2/(cells ** 2) + v_u/cells
        elsif v_wu
          # No per-worker variance: solve for v_s and the combined remainder.
          m = Matrix[[ 0.0, 1.0 ], [ 1.0, 1.0 ]]
          v = m.inverse * Matrix[[ v_wu ], [ v_swu ]]
          v_s = [ v[0,0], 0.0 ].max
          remainder = [ v[1,0], 0.0 ].max
          v_s * mi2/(cells ** 2) + remainder/cells
        elsif v_su
          # No per-sentence variance: solve for v_w and the combined remainder.
          m = Matrix[[ 0.0, 1.0 ], [ 1.0, 1.0 ]]
          v = m.inverse * Matrix[[ v_su ], [ v_swu ]]
          v_w = [ v[0,0], 0.0 ].max
          remainder = [ v[1,0], 0.0 ].max
          v_w * nj2/(cells ** 2) + remainder/cells
        else
          # Neither variance is available; 1e100 marks "no estimate possible".
          v_swu.nil? ? 1e100 : v_swu/cells
        end

      quantile = t95([ @sentences.length, @workers.length ].min - 1)
      by_sentence[:stats][:ci] = quantile * Math.sqrt(v_mu)
      by_worker[:stats][:ci] = quantile * Math.sqrt(v_mu)
    end
  end
  # Returns the node at tree[path[0]][path[1]]..., creating it and every node
  # on the way (the tree root included) with an empty :values list and an
  # empty :stats hash when those slots are missing.
  def stats_leaf(tree, *path)
    node = tree
    path.each do |key|
      node[:values] ||= []
      node[:stats] ||= {}
      node = (node[key] ||= {})
    end
    node[:values] ||= []
    node[:stats] ||= {}
    node
  end
  private :stats_leaf

  # Resets all derived state and re-reads @assignment_list.
  #
  # Afterwards:
  #   @algorithms      - sorted Array of algorithm keys: names starting with
  #                      "Ref" first, then names starting with "Anc", then the
  #                      rest (prefixes are case-insensitive); alphabetical by
  #                      to_s inside each group, so the order is deterministic
  #   @sentences       - Array of sentence keys sorted by to_s
  #   @workers         - Array of worker ids (Symbols) sorted by to_s
  #   @stats_sentences - tree algorithm -> sentence -> worker
  #   @stats_workers   - tree algorithm -> worker -> sentence
  #
  # Every tree node has a :values Array and a :stats Hash. Only the innermost
  # nodes receive raw scores here; the other levels are filled by build_stats.
  # With an empty assignment list both trees stay empty hashes.
  def init_stats
    algorithms = {}
    sentences = {}
    workers = {}
    @stats_sentences = {}
    @stats_workers = {}

    @assignment_list.each do |a|
      answer = a.answer
      worker = a.workerId.to_sym
      answer.each_key do |algorithm|
        answer[algorithm].each_key do |sentence|
          algorithms[algorithm] = 1
          sentences[sentence] = 1
          workers[worker] = 1
          score = answer[algorithm][sentence]
          stats_leaf(@stats_sentences, algorithm, sentence, worker)[:values] << score
          stats_leaf(@stats_workers, algorithm, worker, sentence)[:values] << score
        end
      end
    end

    @algorithms = algorithms.keys.sort_by do |algorithm|
      label = algorithm.to_s
      group = if label =~ /\ARef/i
                0
              elsif label =~ /\AAnc/i
                1
              else
                2
              end
      # The label breaks ties inside a group, which the old comparator left
      # to the (unstable) sort.
      [group, label]
    end
    @sentences = sentences.keys.sort_by { |sentence| sentence.to_s }
    @workers = workers.keys.sort_by { |worker| worker.to_s }
  end
  # Walks a three-level stats tree bottom-up and computes the statistics of
  # every node that exists.
  #
  # stats            - tree as prepared by init_stats
  # key1, key2, key3 - key lists for the first, second and third level
  #
  # Each innermost node is summarised first; its mean is then appended to the
  # :values of both its parent and its grandparent, so the top-level
  # statistics are taken over innermost means (not over the means of the
  # second-level nodes).
  #
  # Returns +stats+.
  def build_stats(stats, key1, key2, key3)
    key1.each do |k1|
      top = stats[k1]
      next unless top
      key2.each do |k2|
        middle = top[k2]
        next unless middle
        key3.each do |k3|
          leaf = middle[k3]
          next unless leaf
          stats_from_values(leaf)
          leaf_mean = leaf[:stats][:mean]
          middle[:values] << leaf_mean
          top[:values] << leaf_mean
        end
        stats_from_values(middle)
      end
      stats_from_values(top)
    end
    stats
  end
  # Returns the +headphones+ value reported in the first assignment of
  # +worker+ (a Symbol), or an empty string when the worker has no
  # assignment in the list.
  def headphones(worker)
    match = @assignment_list.find { |a| a.workerId.to_sym == worker }
    match ? match.headphones : ''
  end
  # Computes the bonus for every worker known before the call.
  #
  # min_assignments     - assignments needed to qualify for any bonus
  # min_working_time    - accepted for interface compatibility; not used here
  # bonus_quantity      - amount granted for reaching min_assignments
  # bonus_quality_50pct - extra amount for the best-correlated half
  # bonus_quality_10pct - extra amount (on top) for the best-correlated tenth
  #
  # Side effect: the assignments of workers below min_assignments are deleted
  # from @assignment_list and all statistics are rebuilt, so the quality
  # ranking only covers qualifying workers.
  #
  # Returns a Hash worker => { :amount => Float, :reason => String }; workers
  # that did not qualify keep amount 0.0 and an empty reason.
  def compute_bonuses(min_assignments, min_working_time, bonus_quantity, bonus_quality_50pct, bonus_quality_10pct)
    bonus = {}
    dropped = {}
    @workers.each do |worker|
      entry = (bonus[worker] ||= {})
      entry[:amount] ||= 0.0
      entry[:reason] = ''
      if assignment_count(worker) < min_assignments
        dropped[worker] = 1
      else
        entry[:amount] += bonus_quantity
        entry[:reason] = "At least #{min_assignments} HITs completed"
      end
    end

    @assignment_list.delete_if { |a| dropped[a.workerId.to_sym] }
    rebuild_stats

    # Rank the remaining workers, best correlation first.
    ranking = @workers.map do |worker|
      { :worker => worker, :correlation => worker_correlation(worker) }
    end
    ranking.sort! { |x, y| y[:correlation] <=> x[:correlation] }

    ranking.first((0.5 * ranking.length).round).each do |ranked|
      entry = bonus[ranked[:worker]]
      entry[:amount] += bonus_quality_50pct
      entry[:reason] = "At least #{min_assignments} HITs completed; set in the top 50%"
    end
    ranking.first((0.1 * ranking.length).round).each do |ranked|
      entry = bonus[ranked[:worker]]
      entry[:amount] += bonus_quality_10pct
      entry[:reason] = "At least #{min_assignments} HITs completed; set in the top 10%"
    end
    bonus
  end
  # Returns a Hash (worker Symbol => 1) of all workers that have at least one
  # assignment whose workingTime is below +min_working_time+.
  def fast_workers(min_working_time)
    hurried = @assignment_list.select { |a| a.workingTime < min_working_time }
    hurried.inject({}) do |flagged, a|
      flagged[a.workerId.to_sym] = 1
      flagged
    end
  end
  # Returns a Hash (worker => 1) of the workers whose correlation with the
  # overall scores is below 0.25.
  #
  # min_assignments - currently ignored (the assignment-count condition is
  #                   disabled); kept so existing callers keep working.
  def bad_workers(min_assignments)
    weak = @workers.select { |worker| worker_correlation(worker) < 0.25 }
    weak.inject({}) do |flagged, worker|
      flagged[worker] = 1
      flagged
    end
  end
  # Returns a Hash (worker => 1) of the workers that are treated as outliers:
  # correlation with the overall scores below 0.7, or fewer than
  # +min_assignments+ assignments.
  def outliers(min_assignments)
    flagged = {}
    @workers.each do |worker|
      # The correlation is evaluated first for every worker, so a worker
      # without any correlation still raises as before.
      poorly_correlated = worker_correlation(worker) < 0.7
      flagged[worker] = 1 if poorly_correlated || assignment_count(worker) < min_assignments
    end
    flagged
  end
  # Prints a report for each of +workers+ (default: all known workers) to
  # standard output: the per-algorithm summary, the per-sentence counts and
  # means, and then audio setup, assignment count, mean working time and
  # correlation. Statistics that are missing are shown as dashes.
  def print_worker_stats(workers = @workers)
    wide_row = "%-15s %-15s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n"
    narrow_row = "%-15s %-15s %-10s %-10s\n"
    # Formats +value+ with +pattern+, or yields the placeholder when absent.
    cell = lambda { |pattern, value, blank| value ? sprintf(pattern, value) : blank }

    workers.each do |worker|
      printf("Stats for Worker %s\n", worker)
      printf(wide_row, 'Sentence', 'Algorithm', 'Count', 'Min', 'Max', 'Mean', 'StdDev', 'Kurt', 'CI95')
      @algorithms.each do |algorithm|
        node = @stats_workers[algorithm] && @stats_workers[algorithm][worker]
        next unless node
        s = node[:stats]
        printf(wide_row,
               '--',
               algorithm,
               cell.call("%d", s[:count], '---'),
               cell.call("%.1f", s[:min], '---'),
               cell.call("%.1f", s[:max], '---'),
               cell.call("%.2f", s[:mean], '----'),
               cell.call("%.2f", s[:std_dev], '----'),
               cell.call("%.2f", s[:kurtosis], '----'),
               cell.call("%.2f", s[:ci], '----'))
      end
      printf("\n")

      printf(narrow_row, 'Sentence', 'Algorithm', 'Count', 'Mean')
      @sentences.each do |sentence|
        @algorithms.each do |algorithm|
          node = @stats_workers[algorithm] &&
                 @stats_workers[algorithm][worker] &&
                 @stats_workers[algorithm][worker][sentence]
          next unless node
          s = node[:stats]
          printf(narrow_row,
                 sentence,
                 algorithm,
                 cell.call("%d", s[:count], '---'),
                 cell.call("%.2f", s[:mean], '----'))
        end
      end
      printf("\n")

      printf("Audio setup: %s\n", headphones(worker))
      printf("Number of assignments: %d\n", assignment_count(worker))
      printf("Mean working time: %d\n", mean_working_time(worker))
      printf("Correlation: %4.2f\n", worker_correlation(worker))
      printf("\n")
    end
  end
  # Prints one summary row per (sentence, algorithm) pair that has data to
  # standard output, sentences in the outer loop. Statistics that are missing
  # are shown as dashes.
  def print_sentence_stats
    row = "%-15s %-15s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n"
    # Formats +value+ with +pattern+, or yields the placeholder when absent.
    cell = lambda { |pattern, value, blank| value ? sprintf(pattern, value) : blank }

    printf("Global Stats\n")
    printf(row, 'Sentence', 'Algorithm', 'Count', 'Min', 'Max', 'Mean', 'StdDev', 'Kurt', 'CI95')
    @sentences.each do |sentence|
      @algorithms.each do |algorithm|
        node = @stats_sentences[algorithm] && @stats_sentences[algorithm][sentence]
        next unless node
        s = node[:stats]
        printf(row,
               sentence,
               algorithm,
               cell.call("%d", s[:count], '---'),
               cell.call("%.1f", s[:min], '---'),
               cell.call("%.1f", s[:max], '---'),
               cell.call("%.2f", s[:mean], '----'),
               cell.call("%.2f", s[:std_dev], '----'),
               cell.call("%.2f", s[:kurtosis], '----'),
               cell.call("%.2f", s[:ci], '----'))
      end
    end
    printf("\n")
  end
  # Prints the number of assignments and workers, followed by one summary row
  # per algorithm that has data, to standard output. Statistics that are
  # missing are shown as dashes.
  def print_algorithm_stats
    row = "%-15s %-15s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n"
    # Formats +value+ with +pattern+, or yields the placeholder when absent.
    cell = lambda { |pattern, value, blank| value ? sprintf(pattern, value) : blank }

    printf("Global Stats, using %d assignments, %d subjects\n", assignment_list.length, @workers.length)
    printf(row, 'Sentence', 'Algorithm', 'Count', 'Min', 'Max', 'Mean', 'StdDev', 'Kurt', 'CI95')
    @algorithms.each do |algorithm|
      node = @stats_sentences[algorithm]
      next unless node
      s = node[:stats]
      printf(row,
             '--',
             algorithm,
             cell.call("%d", s[:count], '---'),
             cell.call("%.1f", s[:min], '---'),
             cell.call("%.1f", s[:max], '---'),
             cell.call("%.2f", s[:mean], '----'),
             cell.call("%.2f", s[:std_dev], '----'),
             cell.call("%.2f", s[:kurtosis], '----'),
             cell.call("%.2f", s[:ci], '----'))
    end
    printf("\n")
  end
  # Returns how many assignments in @assignment_list were submitted by
  # +worker+ (a Symbol; each workerId is converted with to_sym before the
  # comparison).
  def assignment_count(worker)
    @assignment_list.count { |a| a.workerId.to_sym == worker }
  end
  # Consistency test according to ITU-R BT.500.
  #
  # For every (sentence, algorithm) cell rated by +worker+, the worker's mean
  # is compared with the cell's overall mean. A rating counts as an outlier
  # when it is further away than 2 standard deviations if the cell's kurtosis
  # lies within 3 +/- 1, or sqrt(20) standard deviations otherwise.
  #
  # Returns false (inconsistent) when more than 5% of the worker's cells are
  # outliers AND the high and low outliers are roughly balanced
  # (|high - low| / (high + low) < 0.3); returns true otherwise, including
  # for a worker without any rated cell.
  def consistent?(worker)
    high = 0
    low = 0
    rated = 0
    @sentences.each do |sentence|
      @algorithms.each do |algorithm|
        cell = @stats_sentences[algorithm] && @stats_sentences[algorithm][sentence]
        next unless cell && cell[worker]
        own = cell[worker][:stats]
        overall = cell[:stats]
        width = (overall[:kurtosis] - 3.0).abs <= 1.0 ? 2.0 : Math.sqrt(20.0)
        high += 1 if own[:mean] > overall[:mean] + width * overall[:std_dev]
        low += 1 if own[:mean] < overall[:mean] - width * overall[:std_dev]
        rated += 1
      end
    end

    outliers = high + low
    return true if outliers.zero?
    # Both ratios need float division; with Integer operands they would
    # truncate to 0 or 1 and the thresholds could never work as intended.
    outlier_share = outliers.to_f / rated
    imbalance = (high - low).abs.to_f / outliers
    !(outlier_share > 0.05 && imbalance < 0.3)
  end
  # Compares how much of the score scale +worker+ uses with the spread of the
  # overall scores, looking only at the algorithms the worker has rated.
  #
  # Returns [work_range, mean_range]: max - min of the worker's per-algorithm
  # means, and max - min of the overall per-algorithm means. Returns
  # [0.0, 0.0] when the worker has rated no algorithm (this used to raise
  # NoMethodError on nil).
  def range(worker)
    rated = @algorithms.select do |algorithm|
      @stats_workers[algorithm] && @stats_workers[algorithm][worker]
    end
    return [0.0, 0.0] if rated.empty?

    own_means = rated.map { |algorithm| @stats_workers[algorithm][worker][:stats][:mean] }
    overall_means = rated.map { |algorithm| @stats_workers[algorithm][:stats][:mean] }
    [own_means.max - own_means.min, overall_means.max - overall_means.min]
  end
  # Returns the correlation used to judge +worker+: the algorithm-level
  # correlation when it is available, otherwise the sentence-level one.
  #
  # The sentence-level correlation is only computed when it is needed.
  #
  # Raises RuntimeError when neither correlation is available.
  def worker_correlation(worker)
    correlation = algorithm_mos_correlation(worker) || sentence_mos_correlation(worker)
    fail "no correlation available for worker #{worker}" unless correlation
    correlation
  end
  # Correlates, over every (sentence, algorithm) cell rated by +worker+, the
  # cell's overall mean with the worker's own mean for that cell.
  #
  # Returns whatever Array.correlation_coefficient yields for the two lists.
  def sentence_mos_correlation(worker)
    pairs = []
    @sentences.each do |sentence|
      @algorithms.each do |algorithm|
        cell = @stats_sentences[algorithm] && @stats_sentences[algorithm][sentence]
        own = cell && cell[worker]
        next unless own
        pairs << [cell[:stats][:mean], own[:stats][:mean]]
      end
    end
    overall_means = pairs.map { |pair| pair[0] }
    worker_means = pairs.map { |pair| pair[1] }
    Array.correlation_coefficient(overall_means, worker_means)
  end
  # Used for finding outliers and computing bonuses.
  #
  # Correlates, over every algorithm rated by +worker+, the algorithm's
  # overall mean with the worker's own mean for that algorithm.
  #
  # Returns whatever Array.correlation_coefficient yields for the two lists.
  def algorithm_mos_correlation(worker)
    pairs = []
    @algorithms.each do |algorithm|
      node = @stats_workers[algorithm]
      own = node && node[worker]
      next unless own
      pairs << [node[:stats][:mean], own[:stats][:mean]]
    end
    overall_means = pairs.map { |pair| pair[0] }
    worker_means = pairs.map { |pair| pair[1] }
    Array.correlation_coefficient(overall_means, worker_means)
  end
  # Returns the mean workingTime of the assignments submitted by +worker+
  # (a Symbol), or of all assignments when +worker+ is nil.
  def mean_working_time(worker = nil)
    relevant =
      if worker
        @assignment_list.select { |a| a.workerId.to_sym == worker }
      else
        @assignment_list
      end
    relevant.map { |a| a.workingTime }.mean
  end
  # Dumps the raw per-cell scores as tab-separated text files below the
  # MATLAB/ directory, which must already exist in the working directory:
  #
  #   MATLAB/algorithms.txt       - one line with all algorithm names
  #   MATLAB/sentences.txt        - one line with all sentence names
  #   MATLAB/scores_<algorithm>.txt - one line per worker, one column per
  #                                 sentence, holding the worker's mean score
  #                                 ("%.3f"), or -1.000 when the worker did
  #                                 not rate that sentence
  #
  # Files are opened in block form so each handle is closed even when a
  # write raises.
  def print_raw_scores
    File.open("MATLAB/algorithms.txt", "w") { |datafile| datafile.puts @algorithms.join("\t") }
    File.open("MATLAB/sentences.txt", "w") { |datafile| datafile.puts @sentences.join("\t") }

    @algorithms.each do |algorithm|
      File.open(sprintf("MATLAB/scores_%s.txt", algorithm), "w") do |datafile|
        @workers.each do |worker|
          scores = @sentences.map do |sentence|
            cell = @stats_sentences[algorithm][sentence]
            mean = cell && cell[worker] && cell[worker][:stats][:mean]
            mean || -1.0
          end
          datafile.puts scores.map { |score| sprintf('%.3f', score) }.join("\t")
        end
      end
    end
  end
  507. end # AssignmentSet
  508. end # MOS