PageRenderTime 27ms CodeModel.GetById 41ms RepoModel.GetById 0ms app.codeStats 0ms

/mod/metadatalom/class.autokeyword.php

https://github.com/galitush2005/RTL-BIDI-Hebrew-Moodle-Plugins
PHP | 249 lines | 118 code | 30 blank | 101 comment | 12 complexity | 51e4ea32be3be5981a3c41a236431706 MD5 | raw file
  1. <?php
  2. /*
  3. Class updated by Vitor Gonçalves, August 2006.
  4. contact: vg_AT_ipb.pt
  5. Changes: Integration with field keywords in module Metadata for Moodle
  6. Original Projectname: Automatic Keyword Generator
  7. Version: 0.3
  8. Author: Ver Pangonilo <smp_AT_itsp.info>
  9. Last modified: 26 July 2006
  10. Copyright (C): 2006 Ver Pangonilo, All Rights Reserved
  11. * GNU General Public License (Version 2, June 1991)
  12. *
  13. * This program is free software; you can redistribute
  14. * it and/or modify it under the terms of the GNU
  15. * General Public License as published by the Free
  16. * Software Foundation; either version 2 of the License,
  17. * or (at your option) any later version.
  18. *
  19. * This program is distributed in the hope that it will
  20. * be useful, but WITHOUT ANY WARRANTY; without even the
  21. * implied warranty of MERCHANTABILITY or FITNESS FOR A
  22. * PARTICULAR PURPOSE. See the GNU General Public License
  23. * for more details.
  24. Description:
  25. This class automatically generates META keywords for your
  26. web pages based on the contents of your articles. This will
  27. eliminate the tedious process of thinking about what the best
  28. keywords for your article would be. The basis of the keyword
  29. generation is the number of times any word or phrase
  30. occurred within an article.
  31. This automatic keyword generator will create single words,
  32. two-word phrases and three-word phrases. Single words will be
  33. filtered against a common words list.
  34. Change Log:
  35. ===========
  36. 0.2 Ver Pangonilo - 22 July 2005
  37. ================================
  38. Added user configurable parameters and commented codes
  39. for easier end user understanding.
  40. 0.3 Vasilich (vasilich_AT_grafin.kiev.ua) - 26 July 2006
  41. =========================================================
  42. Added encoding parameter to work with UTF texts, min number
  43. of the word/phrase occurrences,
  44. ******************************************************************/
  /**
   * Automatic keyword generator.
   *
   * Lower-cases a piece of text, strips tags and punctuation from it, and
   * ranks the single words, two-word phrases and three-word phrases it
   * contains by number of occurrences. Every keyword is followed by
   * $keywordSeparator; get_keywords() returns the combined list without the
   * trailing separator.
   */
  class autokeyword {

      //the site contents (lower-cased, tags and punctuation replaced by spaces)
      var $contents;
      //character encoding used for the multibyte string functions
      var $encoding;

      //the generated keywords
      var $keywords;

      //minimum word length for inclusion into the single word metakeys
      var $wordLengthMin;
      //minimum number of occurrences for a single word
      var $wordOccuredMin;

      //minimum word length for inclusion into the 2 word phrase metakeys
      var $word2WordPhraseLengthMin;
      //minimum number of occurrences for a 2 word phrase
      var $phrase2WordLengthMinOccur;

      //minimum word length for inclusion into the 3 word phrase metakeys
      var $word3WordPhraseLengthMin;

      //minimum phrase length for inclusion into the 2 word phrase metakeys
      //NOTE(review): stored but not enforced by parse_2words(); see there.
      var $phrase2WordLengthMin;
      //minimum number of occurrences for a 3 word phrase
      var $phrase3WordLengthMinOccur;

      //minimum phrase length (spaces not counted) for inclusion into the
      //3 word phrase metakeys
      var $phrase3WordLengthMin;

      //separator appended after every keyword by the parse_*() methods
      var $keywordSeparator = ';; ';

      /**
       * Stores the configuration and normalises the content.
       *
       * Declared as __construct() because a method named after the class is
       * no longer treated as a constructor by PHP 8. It is declared before
       * autokeyword() so that __construct() is the constructor wherever both
       * are recognised.
       *
       * @param array  $params   'content' plus the min_* thresholds read
       *                         below; a missing threshold defaults to 0
       *                         (no filtering), missing content to ''.
       * @param string $encoding character encoding of the content; also
       *                         installed as the mbstring internal encoding,
       *                         as before.
       */
      function __construct($params, $encoding)
      {
          $this->encoding = $encoding;
          if (!empty($encoding)) {
              mb_internal_encoding($encoding);
          }
          $this->contents = $this->replace_chars($this->_param($params, 'content', ''));

          // single word
          $this->wordLengthMin = $this->_param($params, 'min_word_length', 0);
          $this->wordOccuredMin = $this->_param($params, 'min_word_occur', 0);

          // 2 word phrase
          $this->word2WordPhraseLengthMin = $this->_param($params, 'min_2words_length', 0);
          $this->phrase2WordLengthMin = $this->_param($params, 'min_2words_phrase_length', 0);
          $this->phrase2WordLengthMinOccur = $this->_param($params, 'min_2words_phrase_occur', 0);

          // 3 word phrase
          $this->word3WordPhraseLengthMin = $this->_param($params, 'min_3words_length', 0);
          $this->phrase3WordLengthMin = $this->_param($params, 'min_3words_phrase_length', 0);
          $this->phrase3WordLengthMinOccur = $this->_param($params, 'min_3words_phrase_occur', 0);
      }

      /**
       * Old-style (class-named) constructor, kept so existing code that calls
       * it explicitly, e.g. parent::autokeyword(), keeps working.
       *
       * @param array  $params   see __construct()
       * @param string $encoding see __construct()
       */
      function autokeyword($params, $encoding)
      {
          $this->__construct($params, $encoding);
      }

      /**
       * Returns all keywords: single words first, then 2 word phrases, then
       * 3 word phrases, joined by $keywordSeparator.
       *
       * The previous implementation always cut two characters off the end,
       * which left a stray ';' behind the last keyword because the separator
       * is three characters long. The whole trailing separator is removed now.
       *
       * @return string separator-joined keywords, '' when there are none
       */
      function get_keywords()
      {
          $keywords = $this->parse_words().$this->parse_2words().$this->parse_3words();

          $separator = $this->keywordSeparator;
          $length = strlen($separator);
          if ($length > 0 && substr($keywords, -$length) === $separator) {
              $keywords = substr($keywords, 0, -$length);
          }
          return $keywords;
      }

      /**
       * Normalises raw content: lower-cases it, strips tags, and replaces
       * the listed HTML entities, punctuation and line breaks/tabs by single
       * spaces.
       *
       * @param string $content raw text or HTML
       * @return string normalised text
       */
      function replace_chars($content)
      {
          //lower-case using the configured encoding instead of a fixed one
          $content = mb_strtolower((string) $content, $this->_encoding());
          $content = strip_tags($content);

          //entities must go first: ';' is itself in the punctuation list, so
          //replacing punctuation first would leave tokens such as "&quot"
          $entities = array('&quot;', '&copy;', '&gt;', '&lt;');
          $content = str_replace($entities, " ", $content);

          $punctuations = array(',', ')', '(', '.', "'", '"',
              '<', '>', ';', '!', '?', '/', '-',
              '_', '[', ']', ':', '+', '=', '#',
              '$', chr(10), chr(13), chr(9));
          $content = str_replace($punctuations, " ", $content);

          // replace multiple gaps
          $content = preg_replace('/ {2,}/', " ", $content);

          return $content;
      }

      /**
       * Single word keywords.
       *
       * Keeps every word that is at least $wordLengthMin characters long, is
       * not in the common words list and is not numeric, then ranks the
       * survivors by occurrence.
       *
       * @return string keywords, each followed by $keywordSeparator
       */
      function parse_words()
      {
          //flipped once so that each lookup is a hash access rather than a
          //scan of the whole list
          $common = array_flip($this->_common_words());

          $words = array();
          foreach ($this->_tokens() as $token) {
              $word = trim($token);
              if ($this->_length($word) >= $this->wordLengthMin
                  && !isset($common[$word])
                  && !is_numeric($word)) {
                  $words[] = $word;
              }
          }

          return $this->_rank($words, $this->wordOccuredMin);
      }

      /**
       * Two word phrase keywords.
       *
       * Both words must be at least $word2WordPhraseLengthMin characters long.
       *
       * NOTE(review): $phrase2WordLengthMin has never been applied here
       * (parse_3words() does apply its counterpart). It is left unenforced so
       * that existing callers do not silently lose keywords; confirm whether
       * it should be.
       *
       * @return string phrases, each followed by $keywordSeparator
       */
      function parse_2words()
      {
          return $this->_parse_phrases(
              2,
              $this->word2WordPhraseLengthMin,
              null,
              $this->phrase2WordLengthMinOccur
          );
      }

      /**
       * Three word phrase keywords.
       *
       * Every word must be at least $word3WordPhraseLengthMin characters long
       * and the three words together (spaces not counted) at least
       * $phrase3WordLengthMin characters. The thresholds are inclusive
       * minimums for all three words; previously the second and third word
       * and the phrase had to be strictly longer while the first word did not.
       *
       * @return string phrases, each followed by $keywordSeparator
       */
      function parse_3words()
      {
          return $this->_parse_phrases(
              3,
              $this->word3WordPhraseLengthMin,
              $this->phrase3WordLengthMin,
              $this->phrase3WordLengthMinOccur
          );
      }

      /**
       * Drops every entry whose count is below $min_occur.
       *
       * @param array $array_count_values word/phrase => number of occurrences
       * @param int   $min_occur          inclusive minimum
       * @return array the remaining entries, keys and order preserved
       */
      function occure_filter($array_count_values, $min_occur)
      {
          $occur_filtered = array();
          foreach ($array_count_values as $word => $occured) {
              if ($occured >= $min_occur) {
                  $occur_filtered[$word] = $occured;
              }
          }
          return $occur_filtered;
      }

      /**
       * Concatenates the KEYS of $array, each one followed by $gule
       * (so the result ends with $gule unless $array is empty).
       *
       * @param string $gule  text appended after every key
       * @param array  $array keyword => count
       * @return string
       */
      function implode($gule, $array)
      {
          $c = "";
          foreach (array_keys($array) as $key) {
              $c .= $key.$gule;
          }
          return $c;
      }

      /**
       * Helper: value of $params[$key], or $default when it is not set.
       */
      function _param($params, $key, $default)
      {
          return isset($params[$key]) ? $params[$key] : $default;
      }

      /**
       * Helper: configured encoding, falling back to UTF-8 when none is set.
       */
      function _encoding()
      {
          return empty($this->encoding) ? 'UTF-8' : $this->encoding;
      }

      /**
       * Helper: length of $text in characters of the configured encoding.
       */
      function _length($text)
      {
          return mb_strlen($text, $this->_encoding());
      }

      /**
       * Helper: the contents split on single spaces.
       * explode() replaces split(), which was removed in PHP 7.0.
       */
      function _tokens()
      {
          return explode(" ", (string) $this->contents);
      }

      /**
       * Helper: counts $phrases, keeps those occurring at least $minOccur
       * times, sorts them from highest to lowest count and joins them.
       */
      function _rank($phrases, $minOccur)
      {
          $filtered = $this->occure_filter(array_count_values($phrases), $minOccur);
          arsort($filtered);
          return $this->implode($this->keywordSeparator, $filtered);
      }

      /**
       * Helper: builds and ranks all phrases of $size consecutive words.
       *
       * @param int      $size            number of words per phrase
       * @param int      $minWordLength   inclusive minimum length of each word
       * @param int|null $minPhraseLength inclusive minimum length of the words
       *                                  concatenated without spaces; null
       *                                  disables the check
       * @param int      $minOccur        inclusive minimum occurrences
       * @return string phrases, each followed by $keywordSeparator
       */
      function _parse_phrases($size, $minWordLength, $minPhraseLength, $minOccur)
      {
          $tokens = array_map('trim', $this->_tokens());

          //always an array, even when nothing qualifies
          $phrases = array();
          $lastStart = count($tokens) - $size;
          for ($start = 0; $start <= $lastStart; $start++) {
              $words = array_slice($tokens, $start, $size);

              $qualifies = true;
              foreach ($words as $word) {
                  if ($this->_length($word) < $minWordLength) {
                      $qualifies = false;
                      break;
                  }
              }
              //\implode() is PHP's built-in, not the method of this class
              if ($qualifies && $minPhraseLength !== null
                  && $this->_length(\implode('', $words)) < $minPhraseLength) {
                  $qualifies = false;
              }

              if ($qualifies) {
                  $phrases[] = \implode(" ", $words);
              }
          }

          return $this->_rank($phrases, $minOccur);
      }

      /**
       * Helper: list of commonly used words (Portuguese and English, plus a
       * few HTML leftovers) that are never reported as single word keywords.
       * This can be edited to suit your needs.
       */
      function _common_words()
      {
          return array("e", "apenas","permite","permitir","está","todo","todos","neste","nesta","capaz",
              "acima", "actua", "soma", "medo", "após", "depois", "outra", "outro", "outras", "outros",
              "podem", "vez", "contra", "idade", "há", "concorda", "tudo", "quase", "sozinho", "só",
              "longo", "de", "da", "do", "já", "agora", "também", "embora", "contudo", "entanto",
              "sempre", "meio", "quantidade", "olá", "e", "raiva", "irritado", "resposta", "alguns",
              "aparece", "questão", "é", "chega", "braço", "braços", "através", "chega", "como",
              "pedir", "em", "onde", "tentativa", "tia", "ausente", "talvez", "mau", "saco", "uma",
              "esteja", "se", "porque", "esse", "sido", "antes", "começou", "começa", "atrás", "sendo",
              "dessa", "desse", "pertence", "abaixo", "lado", "mais", "ao", "à", "às", "ás", "melhor",
              "pior", "entre", "além", "grande","corpo", "osso", "carregado","pede", "ambos", "fundo",
              "caixa", "menino", "ruptura", "traz", "trazido", "erro", "construído", "ocupado", "mas",
              "compra", "por", "chamada", "veio", "pode", "causa", "escolhe", "próximo", "anterior",
              "considera", "vindo", "considerável", "contêm", "continuam", "poderia", "grito", "corte",
              "desafio", "pelo", "pela", "caro", "barato", "profundo", "sim", "dado", "não", "pois",
              "cão", "feito", "dúvida", "baixo", "durante", "cada", "um", "uma", "come", "qualquer",
              "alguma", "que", "a", "o", "extremidade", "aprecia", "bastante", "entrada", "mesmo",
              "excepto", "espera", "explica", "falha", "queda", "distante", "gordura", "favor",
              "sensação", "pés", "caiu", "sentido", "poucos", "muitos", "achado", "mosca", "segue",
              "para", "esquece-se", "parte", "dianteira", "deu", "começa", "dá", "vai", "ido", "bom",
              "começado", "cinza", "enorme", "verde", "cresceu", "cresce", "suposição", "teve",
              "metade", "cair", "acontece", "tem", "chapéu", "têm", "ele", "ela", "eu", "tu", "ouve",
              "ouvido", "preso", "conseguinte", "ajuda", "aqui", "dela", "elevado", "monte", "dele",
              "dela", "batida", "porquê", "quente", "frio", "entretanto", "aquando",
              "se", "mal", "bem", "certamente", "preferivelmente", "certo", "errado", "fosse", "essas",
              "seu", "sua", "justo", "suas", "manter", "soube", "sabe", "sabido", "atrasado", "menos",
              "conduzir", "sair", "emprestar", "menos", "deixar", "comer", "provável", "gosto",
              "solitário", "longo", "olhar", "lote", "fazem", "enormes", "ora", "mim", "ainda",
              "encontrado", "opss", "milha", "mina", "lua", "sol", "movimento", "nunca", "meu",
              "minha", "perto", "quase", "necessário", "nem", "nunca", "seguinte", "nenhum", "nenhuns",
              "nem", "nota", "nada", "número", "frequentemente", " ah", "ps", "somente", "ou",
              "principalmente", "nossa", "nosso", "fora", "por favor", "preparam", "provável", "puro",
              "funcionaram", "alcance", "realizam", "requerem", "descanso", "disse", "sentado",
              "palavra","parece", "visto", "emitido", "separado", "deve", "desde", "assim", "vendido",
              "alguns", "logo", "pesaroso", "etapa", "vara", "tais", "tal", "supõe", "tomada", "feito",
              "conversa", "alto", "diz", "dez", "agradecer", "obrigado", "eles", "então", "lá",
              "consequentemente", "estes", "eles", "isto", "aqueles", "aquilo", "até", "aquelas",
              "estas", "hoje", "amanhã", "demasiado", "rasgou", "volta", "dois", "sob", "sobre", "nós",
              "vós", "uso", "usual", "vários", "diversos", "visita", "querem", "era", "fomos",
              "éramos", "quanto", "quantos", "onde", "quantos", "quantas", "branco", "quem", "quais",
              "qual", "cujo", "com", "dentro", "sem", "com", "tanto", "você", "cópia", "able", "about",
              "above", "act", "add", "afraid", "after", "again", "against", "age", "ago", "agree",
              "all", "almost", "alone", "along", "already", "also", "although", "always", "am",
              "amount", "an", "and", "anger", "angry", "animal", "another", "answer", "any", "appear",
              "apple", "are", "arrive", "arm", "arms", "around", "arrive", "as", "ask", "at",
              "attempt", "aunt", "away", "back", "bad", "bag", "bay", "be", "became", "because",
              "become", "been", "before", "began", "begin", "behind", "being", "bell", "belong",
              "below", "beside", "best", "better",
              "between", "beyond", "big", "body", "bone", "born", "borrow", "both", "bottom", "box",
              "boy", "break", "bring", "brought", "bug", "built", "busy", "but", "buy", "by", "call",
              "came", "can", "cause", "choose", "close", "close", "consider", "come", "consider",
              "considerable", "contain", "continue", "could", "cry", "cut", "dare", "dark", "deal",
              "dear", "decide", "deep", "did", "die", "do", "does", "dog", "done", "doubt", "down",
              "during", "each", "ear", "early", "eat", "effort", "either", "else", "end", "enjoy",
              "enough", "enter", "even", "ever", "every", "except", "expect", "explain", "fail",
              "fall", "far", "fat", "favor", "fear", "feel", "feet", "fell", "felt", "few", "fill",
              "find", "fit", "fly", "follow", "for", "forever", "forget", "from", "front", "gave",
              "get", "gives", "goes", "gone", "good", "got", "gray", "great", "green", "grew", "grow",
              "guess", "had", "half", "hang", "happen", "has", "hat", "have", "he", "hear", "heard",
              "held", "hello", "help", "her", "here", "hers", "high", "hill", "him", "his", "hit",
              "hold", "hot", "how", "however", "I", "if", "ill", "in", "indeed", "instead", "into",
              "iron", "is", "it", "its", "just", "keep", "kept", "knew", "know", "known", "late",
              "least", "led", "left", "lend", "less", "let", "like", "likely", "likr", "lone", "long",
              "look", "lot", "make", "many", "may", "me", "mean", "met", "might", "mile", "mine",
              "moon", "more", "most", "move", "much", "must", "my", "near", "nearly", "necessary",
              "neither", "never", "next", "no", "none", "nor", "not", "note", "nothing", "now",
              "number", "of", "off", "often", "oh", "on", "once", "only", "or", "other", "ought",
              "our", "out", "please", "prepare", "probable", "pull", "pure", "push", "put", "raise",
              "ran", "rather", "reach", "realize", "reply", "require", "rest", "run", "said", "same",
              "sat", "saw", "say", "see", "seem", "seen", "self", "sell", "sent", "separate", "set",
              "shall", "she", "should", "side", "sign", "since", "so", "sold", "some", "soon", "sorry",
              "stay", "step",
              "stick", "still", "stood", "such", "sudden", "suppose", "take", "taken", "talk", "tall",
              "tell", "ten", "than", "thank", "that", "the", "their", "them", "then", "there",
              "therefore", "these", "they", "this", "those", "though", "through", "till", "to",
              "today", "told", "tomorrow", "too", "took", "tore", "tought", "toward", "tried", "tries",
              "trust", "try", "turn", "two", "under", "until", "up", "upon", "us", "use", "usual",
              "various", "verb", "very", "visit", "want", "was", "we", "well", "went", "were", "what",
              "when", "where", "whether", "which", "while", "white", "who", "whom", "whose", "why",
              "will", "with", "within", "without", "would", "yes", "yet", "you", "young", "your", "br",
              "img", "p","lt", "gt", "quot", "copy");
      }
  }
  213. ?>