PageRenderTime 47ms CodeModel.GetById 17ms RepoModel.GetById 1ms app.codeStats 0ms

/app/models/automated_metareview/text_preprocessing.rb

https://github.com/danrosshoward/expertiza
Ruby | 342 lines | 314 code | 5 blank | 23 comment | 14 complexity | 2e7deaef86830789f50a6722389f313e MD5 | raw file
Possible License(s): GPL-2.0
  1. require 'automated_metareview/constants'
  2. require 'automated_metareview/edge'
  3. require 'automated_metareview/vertex'
  4. class TextPreprocessing
  # Collects the non-blank comments of the most recently updated response
  # recorded for a response map.
  #
  # auto_metareview - object whose +responses+ and +response_id+ attributes
  #                   are set to the response that was found and its id.
  # map_id          - id of the response map whose latest response is read.
  #
  # Returns an Array of comment strings. The array is empty (and
  # auto_metareview is left untouched) when the map has no response.
  def fetch_review_data(auto_metareview, map_id)
    reviews = Array.new
    # Newest response first; nil when nothing has been submitted for this map.
    latest_response = Response.find(:first, :conditions => ["map_id = ?", map_id], :order => "updated_at DESC")
    return reviews if latest_response.nil?

    auto_metareview.responses = latest_response
    auto_metareview.response_id = latest_response.id
    latest_response.scores.each do |review_score|
      comment = review_score.comments
      # Scores without a comment, or with a whitespace-only one, carry no review text.
      next if comment.nil? || comment.rstrip.empty?
      reviews << comment
    end
    reviews
  end
  24. #------------------------------------------#------------------------------------------#------------------------------------------
  # Fetches the text of the submission that was reviewed through a response map.
  #
  # The hyperlink is read from the reviewee's participant record; when that
  # record is missing or has no hyperlink (team assignments), the participants
  # of the team's users are tried in turn. The page behind the link is then
  # downloaded and its text collected.
  #
  # map_id - id of the response map.
  #
  # Returns an Array of strings: the text of every <p> element, followed by the
  # text embedded in <script> elements between the "s":" and \n"}, markers
  # (the layout used by Google Docs pages). Returns an empty Array when the
  # map or a usable hyperlink cannot be found.
  def fetch_submission_data(map_id)
    subm_array = Array.new
    response_map = ResponseMap.find(:first, :conditions => ["id = ?", map_id])
    return subm_array if response_map.nil?

    reviewee_id = response_map.reviewee_id
    reviewed_object = response_map.reviewed_object_id
    participant = Participant.find(:first, :conditions => ["id = ?", reviewee_id])
    url = participant.nil? ? nil : participant.submitted_hyperlinks
    if url.nil? # in case of team assignments
      teams_users = TeamsUser.find(:all, :conditions => ["team_id = ?", reviewee_id])
      teams_users.each do |team_user|
        member = Participant.find(:first, :conditions => ["user_id = ? and parent_id = ?", team_user.user_id, reviewed_object])
        next if member.nil?
        url = member.submitted_hyperlinks
        break unless url.nil? # stop at the first member that submitted a link
      end
    end

    # "rindex" picks the last link when several were submitted.
    link_start = url.nil? ? nil : url.rindex("http")
    return subm_array if link_start.nil?
    # The final character of the stored value is dropped, as before
    # (presumably a trailing newline from serialization -- TODO confirm).
    url = url[link_start..url.length - 2]

    page = Nokogiri::HTML(open(url))
    # Paragraph texts of the page.
    page.css('p').each { |paragraph| subm_array << paragraph.text }
    # Google Docs keeps the document text inside <script></script> tags.
    page.css('script').each do |script|
      script_text = script.children[0].to_s
      marker_at = script_text.index("\"s\":\"") # marks the beginning of the text
      next if marker_at.nil?
      text_start = marker_at + 5
      # Look for the end marker only after the start marker, and slice with an
      # exclusive range: String#[start, n] takes a length, not an end index.
      text_end = script_text.index("\\n\"},", text_start)
      next if text_end.nil?
      subm_array << script_text[text_start...text_end]
    end
    subm_array
  end
  66. #------------------------------------------#------------------------------------------#------------------------------------------
  # Pre-processes review/submission texts and breaks them into clauses so they
  # can be sent on for graph formation and further analysis.
  #
  # Each text has double quotes and parentheses removed, has its URLs stripped
  # by remove_urls, and is then cut after every ".", "?", "!", "," or ";".
  # Every clause keeps its closing punctuation mark and is stripped of
  # surrounding whitespace. A text without any of these marks is kept as one
  # clause, and so is a trailing clause that lacks closing punctuation.
  #
  # flag       - 0 (training): all clauses are gathered into one flat list.
  #              any other value (testing): one list of clauses per text.
  # text_array - Array of strings; the strings themselves are not modified.
  #
  # Returns an Array of clause strings when flag is 0, otherwise an Array with
  # one Array of clause strings per entry of text_array.
  def segment_text(flag, text_array)
    training = (flag == 0)
    reviews = Array.new
    text_array.each do |raw_text|
      # Work on a cleaned copy so the caller's strings are left untouched.
      text = raw_text.to_s.delete("\"()")
      # remove_urls hands back text without "http://" links unchanged.
      text = remove_urls(text) if text.include?("http")

      sentences = Array.new
      # Cut after the earliest punctuation mark until none is left.
      while (cut = text.index(/[.?!,;]/))
        sentences << text[0..cut].strip
        text = text[(cut + 1)..-1]
      end
      # Whatever follows the last mark is a clause too, unless it is blank;
      # a text with no mark at all is always recorded, even when empty.
      tail = text.strip
      sentences << tail if sentences.empty? || !tail.empty?

      if training
        reviews.concat(sentences)
      else
        reviews << sentences
      end
    end
    reviews
  end
  145. #------------------------------------------#------------------------------------------#------------------------------------------
  # Reads edge patterns from a csv file and turns each one into an Edge.
  #
  # The first field of every row is expected to look like
  # "<in vertex> = <out vertex>". The first word of each vertex is passed to
  # pos.get_readable, whose result is read as "word/TAG"; the tag decides the
  # vertex type:
  #   VB*, MD -> VERB, JJ* -> ADJ, RB* -> ADV, everything else -> NOUN
  #   (NN*, PRP*, IN, EX and WP* therefore map to NOUN).
  # Blank rows and rows without "=" are skipped.
  #
  # filename - path of the csv file. A name containing "prob" marks the
  #            patterns NEGATED, one containing "suggest" marks them
  #            SUGGESTIVE; all others are POSITIVE.
  # pos      - tagger responding to get_readable(word).
  #
  # Returns an Array of Edge objects with in_vertex and out_vertex set.
  def read_patterns(filename, pos)
    patterns = Array.new
    # setting the state for problem detection and suggestive patterns
    state = POSITIVE
    if filename.include?("prob")
      state = NEGATED
    elsif filename.include?("suggest")
      state = SUGGESTIVE
    end

    # Builds the vertex for one side of a pattern; shared by in and out vertex.
    build_vertex = lambda do |name, position|
      tagged = pos.get_readable(name.split(" ")[0]) # first token determines the POS
      # Take the tag after the LAST slash so words that contain "/" themselves
      # do not leak into the tag.
      slash = tagged.rindex("/")
      tag = slash.nil? ? "" : tagged[(slash + 1)..-1]
      # Match on the tag only, never on the word in front of it.
      type = case tag
             when /\A(?:VB|MD)/ then VERB
             when /\AJJ/ then ADJ
             when /\ARB/ then ADV
             else NOUN # nouns, pronouns, prepositions, ... and the default
             end
      Vertex.new(name, type, position, state, nil, nil, tag)
    end

    FasterCSV.foreach(filename) do |row|
      line = row[0]
      separator = line.nil? ? nil : line.index("=")
      next if separator.nil?

      in_vertex = line[0...separator].strip
      # +2 skips the "=" and the character that follows it, as before.
      out_vertex = line[(separator + 2)..line.length].strip
      position = patterns.length # index of this pattern among those read so far
      edge = Edge.new("noun", NOUN)
      edge.in_vertex = build_vertex.call(in_vertex, position)
      edge.out_vertex = build_vertex.call(out_vertex, position)
      patterns << edge
    end
    patterns
  end
  196. #------------------------------------------#------------------------------------------#------------------------------------------
  # Removes the urls from a text.
  #
  # A text without "http://" or "https://" is returned unchanged. Otherwise the
  # text is split on whitespace, every token containing one of those schemes
  # is dropped, and the remaining tokens are returned, each preceded by a
  # single space (so the result starts with a space and runs of whitespace are
  # collapsed).
  #
  # text - the String to clean.
  #
  # Returns a String.
  def remove_urls(text)
    schemes = ["http://", "https://"]
    return text unless schemes.any? { |scheme| text.include?(scheme) }

    kept_tokens = text.split(" ").reject do |token|
      schemes.any? { |scheme| token.include?(scheme) }
    end
    kept_tokens.map { |token| " " + token }.join
  end
  215. #------------------------------------------#------------------------------------------#------------------------------------------
  # Strips quoted passages from reviews so that quoted material is not taken
  # into account by the plagiarism check.
  #
  # Every segment enclosed in a pair of double quotes is removed together with
  # its quotes. A quote mark without a partner is left in place. The strings
  # passed in are not modified.
  #
  # review_text - Array of review strings.
  #
  # Returns a new Array with one cleaned String per review.
  def remove_text_within_quotes(review_text)
    # One regexp pass removes every balanced pair; it cannot loop forever on an
    # unmatched quote and does not depend on how Array#to_s renders scan results.
    review_text.map { |row| row.gsub(/"[^"]*"/, "") }
  end
  247. #------------------------------------------#------------------------------------------#------------------------------------------
  # Corrects spelling mistakes in the review texts with the given speller.
  #
  # Each text is split on whitespace. A token the speller rejects is replaced
  # by the speller's first suggestion, when it has one. All tokens are
  # downcased and glued back together, each preceded by a single space.
  #
  # review_text_array - Array of review strings.
  # speller           - object responding to check(word) and suggest(word).
  #
  # Returns an Array with one corrected String per review.
  def check_correct_spellings(review_text_array, speller)
    review_text_array.map do |review_text|
      corrected_tokens = review_text.split(" ").map do |token|
        unless speller.check(token)
          # Ask for suggestions once and reuse the answer.
          best_guess = speller.suggest(token).first
          token = best_guess unless best_guess.nil?
        end
        " " + token.downcase
      end
      corrected_tokens.join
    end
  end
  273. #------------------------------------------#------------------------------------------#------------------------------------------
  # Removes punctuation from "str", in place.
  #
  # The marks are tried in the order . , ? ! ; : ( ) [ ] and only the first
  # kind that occurs in the string is removed (all of its occurrences); any
  # other kinds are left alone.
  # NOTE(review): stopping after the first kind is the historic behavior --
  # confirm with callers before widening it to every mark.
  #
  # str - the String to clean; it is modified.
  #
  # Returns the same String object.
  public #The method was throwing a "NoMethodError: private method" error when called from a different class. Hence the "public" keyword.
  def contains_punct(str)
    first_mark = [".", ",", "?", "!", ";", ":", "(", ")", "[", "]"].find { |mark| str.include?(mark) }
    # A String pattern is matched literally, so none of the marks needs escaping.
    str.gsub!(first_mark, "") unless first_mark.nil?
    str
  end
  # Tells whether "str" contains a backslash followed by "n" (two characters,
  # not a newline), a "}" or a "{".
  #
  # Returns true or false.
  def contains_punct_bool(str)
    ["\\n", "}", "{"].any? { |marker| str.include?(marker) }
  end
  309. #------------------------------------------#------------------------------------------#------------------------------------------
  # Checks whether "str" is exactly one of the punctuation marks
  # ".", ",", "?", "!", ";" or ":".
  #
  # Returns true or false.
  def is_punct(str)
    [".", ",", "?", "!", ";", ":"].any? { |mark| str == mark }
  end
  320. end #end of class