/mod/metadatalom/class.autokeyword.php
PHP | 249 lines | 118 code | 30 blank | 101 comment | 12 complexity | 51e4ea32be3be5981a3c41a236431706 MD5 | raw file
Possible License(s): Apache-2.0, GPL-3.0
- <?php
- /*
- Class updated by Vitor Gonçalves, August 2006.
- contact: vg_AT_ipb.pt
- Changes: Integration with field keywords in module Metadata for Moodle
- Original Projectname: Automatic Keyword Generator
- Version: 0.3
- Author: Ver Pangonilo <smp_AT_itsp.info>
- Last modified: 26 July 2006
- Copyright (C): 2006 Ver Pangonilo, All Rights Reserved
-
- * GNU General Public License (Version 2, June 1991)
- *
- * This program is free software; you can redistribute
- * it and/or modify it under the terms of the GNU
- * General Public License as published by the Free
- * Software Foundation; either version 2 of the License,
- * or (at your option) any later version.
- *
- * This program is distributed in the hope that it will
- * be useful, but WITHOUT ANY WARRANTY; without even the
- * implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the GNU General Public License
- * for more details.
-
- Description:
- This class automatically generates META keywords for your
- web pages based on the contents of your articles. This will
- eliminate the tedious process of thinking what will be the best
- keywords that suits your article. The basis of the keyword
- generation is the number of times any word or phrase
- occurs within an article.
-
- This automatic keyword generator will create single words,
- two word phrase and three word phrases. Single words will be
- filtered from a common words list.
-
- Change Log:
- ===========
- 0.2 Ver Pangonilo - 22 July 2005
- ================================
- Added user configurable parameters and commented codes
- for easier end user understanding.
-
- 0.3 Vasilich (vasilich_AT_grafin.kiev.ua) - 26 July 2006
- =========================================================
- Added encoding parameter to work with UTF texts, min number
- of the word/phrase occurrences,
-
- ******************************************************************/
-
/**
 * Automatic keyword generator.
 *
 * Extracts single words, two-word phrases and three-word phrases from a piece
 * of (HTML) text, keeps those that are long enough and occur often enough, and
 * returns them ordered by frequency and joined with ";; ".
 *
 * Single words are additionally filtered against a built-in list of common
 * Portuguese and English words; phrases are not.
 */
class autokeyword
{
    /** Separator appended after every generated keyword. */
    const SEPARATOR = ';; ';

    /** @var string normalised text (lower-cased, tags and punctuation removed) */
    public $contents;
    /** @var string character encoding used for all multibyte operations */
    public $encoding;
    /** @var mixed reserved for the generated keywords; not written by this class */
    public $keywords;
    /** @var int minimum length of a word to become a single-word keyword */
    public $wordLengthMin;
    /** @var int minimum number of occurrences of a single-word keyword */
    public $wordOccuredMin;
    /** @var int minimum length of each word inside a two-word phrase */
    public $word2WordPhraseLengthMin;
    /** @var int minimum number of occurrences of a two-word phrase */
    public $phrase2WordLengthMinOccur;
    /** @var int minimum length of each word inside a three-word phrase */
    public $word3WordPhraseLengthMin;
    /** @var int minimum combined length (spaces excluded) of a two-word phrase */
    public $phrase2WordLengthMin;
    /** @var int minimum number of occurrences of a three-word phrase */
    public $phrase3WordLengthMinOccur;
    /** @var int minimum combined length (spaces excluded) of a three-word phrase */
    public $phrase3WordLengthMin;

    /**
     * Common words that never become single-word keywords.
     * This list can be edited to suit your needs.
     *
     * @var array
     */
    private static $commonWords = array("e", "apenas","permite","permitir","está","todo","todos","neste","nesta","capaz", "acima", "actua", "soma", "medo", "após", "depois", "outra", "outro", "outras", "outros", "podem", "vez", "contra", "idade", "há", "concorda", "tudo", "quase", "sozinho", "só", "longo", "de", "da", "do", "já", "agora", "também", "embora", "contudo", "entanto", "sempre", "meio", "quantidade", "olá", "e", "raiva", "irritado", "resposta", "alguns", "aparece", "questão", "é", "chega", "braço", "braços", "através", "chega", "como", "pedir", "em", "onde", "tentativa", "tia", "ausente", "talvez", "mau", "saco", "uma", "esteja", "se", "porque", "esse", "sido", "antes", "começou", "começa", "atrás", "sendo", "dessa", "desse", "pertence", "abaixo", "lado", "mais", "ao", "à", "às", "ás", "melhor", "pior", "entre", "além", "grande","corpo", "osso", "carregado","pede", "ambos", "fundo", "caixa", "menino", "ruptura", "traz", "trazido", "erro", "construído", "ocupado", "mas", "compra", "por", "chamada", "veio", "pode", "causa", "escolhe", "próximo", "anterior", "considera", "vindo", "considerável", "contêm", "continuam", "poderia", "grito", "corte", "desafio", "pelo", "pela", "caro", "barato", "profundo", "sim", "dado", "não", "pois", "cão", "feito", "dúvida", "baixo", "durante", "cada", "um", "uma", "come", "qualquer", "alguma", "que", "a", "o", "extremidade", "aprecia", "bastante", "entrada", "mesmo", "excepto", "espera", "explica", "falha", "queda", "distante", "gordura", "favor", "sensação", "pés", "caiu", "sentido", "poucos", "muitos", "achado", "mosca", "segue", "para", "esquece-se", "parte", "dianteira", "deu", "começa", "dá", "vai", "ido", "bom", "começado", "cinza", "enorme", "verde", "cresceu", "cresce", "suposição", "teve", "metade", "cair", "acontece", "tem", "chapéu", "têm", "ele", "ela", "eu", "tu", "ouve", "ouvido", "preso", "conseguinte", "ajuda", "aqui", "dela", "elevado", "monte", "dele", "dela", "batida", "porquê", "quente", "frio", "entretanto", "aquando", "se",
        "mal", "bem", "certamente", "preferivelmente", "certo", "errado", "fosse", "essas", "seu", "sua", "justo", "suas", "manter", "soube", "sabe", "sabido", "atrasado", "menos", "conduzir", "sair", "emprestar", "menos", "deixar", "comer", "provável", "gosto", "solitário", "longo", "olhar", "lote", "fazem", "enormes", "ora", "mim", "ainda", "encontrado", "opss", "milha", "mina", "lua", "sol", "movimento", "nunca", "meu", "minha", "perto", "quase", "necessário", "nem", "nunca", "seguinte", "nenhum", "nenhuns", "nem", "nota", "nada", "número", "frequentemente", " ah", "ps", "somente", "ou", "principalmente", "nossa", "nosso", "fora", "por favor", "preparam", "provável", "puro", "funcionaram", "alcance", "realizam", "requerem", "descanso", "disse", "sentado", "palavra","parece", "visto", "emitido", "separado", "deve", "desde", "assim", "vendido", "alguns", "logo", "pesaroso", "etapa", "vara", "tais", "tal", "supõe", "tomada", "feito", "conversa", "alto", "diz", "dez", "agradecer", "obrigado", "eles", "então", "lá", "consequentemente", "estes", "eles", "isto", "aqueles", "aquilo", "até", "aquelas", "estas", "hoje", "amanhã", "demasiado", "rasgou", "volta", "dois", "sob", "sobre", "nós", "vós", "uso", "usual", "vários", "diversos", "visita", "querem", "era", "fomos", "éramos", "quanto", "quantos", "onde", "quantos", "quantas", "branco", "quem", "quais", "qual", "cujo", "com", "dentro", "sem", "com", "tanto", "você", "cópia", "able", "about", "above", "act", "add", "afraid", "after", "again", "against", "age", "ago", "agree", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "amount", "an", "and", "anger", "angry", "animal", "another", "answer", "any", "appear", "apple", "are", "arrive", "arm", "arms", "around", "arrive", "as", "ask", "at", "attempt", "aunt", "away", "back", "bad", "bag", "bay", "be", "became", "because", "become", "been", "before", "began", "begin", "behind", "being", "bell", "belong", "below", "beside", "best", "better",
        "between", "beyond", "big", "body", "bone", "born", "borrow", "both", "bottom", "box", "boy", "break", "bring", "brought", "bug", "built", "busy", "but", "buy", "by", "call", "came", "can", "cause", "choose", "close", "close", "consider", "come", "consider", "considerable", "contain", "continue", "could", "cry", "cut", "dare", "dark", "deal", "dear", "decide", "deep", "did", "die", "do", "does", "dog", "done", "doubt", "down", "during", "each", "ear", "early", "eat", "effort", "either", "else", "end", "enjoy", "enough", "enter", "even", "ever", "every", "except", "expect", "explain", "fail", "fall", "far", "fat", "favor", "fear", "feel", "feet", "fell", "felt", "few", "fill", "find", "fit", "fly", "follow", "for", "forever", "forget", "from", "front", "gave", "get", "gives", "goes", "gone", "good", "got", "gray", "great", "green", "grew", "grow", "guess", "had", "half", "hang", "happen", "has", "hat", "have", "he", "hear", "heard", "held", "hello", "help", "her", "here", "hers", "high", "hill", "him", "his", "hit", "hold", "hot", "how", "however", "I", "if", "ill", "in", "indeed", "instead", "into", "iron", "is", "it", "its", "just", "keep", "kept", "knew", "know", "known", "late", "least", "led", "left", "lend", "less", "let", "like", "likely", "likr", "lone", "long", "look", "lot", "make", "many", "may", "me", "mean", "met", "might", "mile", "mine", "moon", "more", "most", "move", "much", "must", "my", "near", "nearly", "necessary", "neither", "never", "next", "no", "none", "nor", "not", "note", "nothing", "now", "number", "of", "off", "often", "oh", "on", "once", "only", "or", "other", "ought", "our", "out", "please", "prepare", "probable", "pull", "pure", "push", "put", "raise", "ran", "rather", "reach", "realize", "reply", "require", "rest", "run", "said", "same", "sat", "saw", "say", "see", "seem", "seen", "self", "sell", "sent", "separate", "set", "shall", "she", "should", "side", "sign", "since", "so", "sold", "some", "soon", "sorry", "stay", "step",
        "stick", "still", "stood", "such", "sudden", "suppose", "take", "taken", "talk", "tall", "tell", "ten", "than", "thank", "that", "the", "their", "them", "then", "there", "therefore", "these", "they", "this", "those", "though", "through", "till", "to", "today", "told", "tomorrow", "too", "took", "tore", "tought", "toward", "tried", "tries", "trust", "try", "turn", "two", "under", "until", "up", "upon", "us", "use", "usual", "various", "verb", "very", "visit", "want", "was", "we", "well", "went", "were", "what", "when", "where", "whether", "which", "while", "white", "who", "whom", "whose", "why", "will", "with", "within", "without", "would", "yes", "yet", "you", "young", "your", "br", "img", "p","lt", "gt", "quot", "copy");

    /** @var array|null $commonWords flipped into a word => index map, built on first use */
    private static $commonWordLookup = null;

    /**
     * Configures the generator and normalises the text to analyse.
     *
     * @param array  $params   expected keys: 'content', 'min_word_length',
     *                         'min_word_occur', 'min_2words_length',
     *                         'min_2words_phrase_length', 'min_2words_phrase_occur',
     *                         'min_3words_length', 'min_3words_phrase_length',
     *                         'min_3words_phrase_occur'. A missing key is read
     *                         as null, which disables that particular limit.
     * @param string $encoding character encoding of the content
     */
    public function __construct($params, $encoding = 'UTF-8')
    {
        $this->encoding = $encoding ? $encoding : 'UTF-8';
        // Kept for backward compatibility: callers may rely on the internal
        // encoding having been switched by this class.
        mb_internal_encoding($this->encoding);
        $this->contents = $this->replace_chars(self::param($params, 'content'));

        // single word
        $this->wordLengthMin = self::param($params, 'min_word_length');
        $this->wordOccuredMin = self::param($params, 'min_word_occur');

        // 2 word phrase
        $this->word2WordPhraseLengthMin = self::param($params, 'min_2words_length');
        $this->phrase2WordLengthMin = self::param($params, 'min_2words_phrase_length');
        $this->phrase2WordLengthMinOccur = self::param($params, 'min_2words_phrase_occur');

        // 3 word phrase
        $this->word3WordPhraseLengthMin = self::param($params, 'min_3words_length');
        $this->phrase3WordLengthMin = self::param($params, 'min_3words_phrase_length');
        $this->phrase3WordLengthMinOccur = self::param($params, 'min_3words_phrase_occur');
    }

    /**
     * Legacy PHP 4-style constructor name.
     *
     * PHP 8 no longer treats a method named after the class as a constructor,
     * so the real work lives in __construct(); this alias keeps explicit
     * calls such as parent::autokeyword(...) working.
     *
     * @param array  $params   see __construct()
     * @param string $encoding see __construct()
     */
    public function autokeyword($params, $encoding = 'UTF-8')
    {
        $this->__construct($params, $encoding);
    }

    /**
     * Returns all generated keywords (single words first, then two-word and
     * three-word phrases) joined with ";; " and without a trailing separator.
     *
     * @return string empty string when nothing qualified
     */
    public function get_keywords()
    {
        $keywords = $this->parse_words() . $this->parse_2words() . $this->parse_3words();
        if ($keywords === '') {
            return '';
        }
        // Every keyword is followed by the separator; drop the last one in
        // full. The separator is plain ASCII, so byte-wise functions are safe.
        return substr($keywords, 0, -strlen(self::SEPARATOR));
    }

    /**
     * Normalises raw content: lower-cases it, strips HTML tags, replaces
     * punctuation and line breaks with spaces and collapses runs of spaces.
     *
     * @param string $content raw text
     * @return string normalised text
     */
    public function replace_chars($content)
    {
        // Lower-case in the configured encoding so it agrees with the
        // encoding used for the length checks.
        $encoding = $this->encoding ? $this->encoding : 'UTF-8';
        $content = mb_strtolower((string) $content, $encoding);
        $content = strip_tags($content);

        $punctuations = array(',', ')', '(', '.', "'", '"',
            '<', '>', ';', '!', '?', '/', '-',
            '_', '[', ']', ':', '+', '=', '#',
            '$', '"', '©', '>', '<',
            chr(10), chr(13), chr(9));

        $content = str_replace($punctuations, " ", $content);
        // replace multiple gaps
        $content = preg_replace('/ {2,}/', " ", $content);

        return $content;
    }

    /**
     * Builds the single-word keywords.
     *
     * A word qualifies when it is at least $wordLengthMin characters long, is
     * not a common word, is not numeric and occurs at least $wordOccuredMin
     * times.
     *
     * @return string keywords ordered by frequency, each followed by ";; "
     */
    public function parse_words()
    {
        $common = self::common_word_lookup();
        $words = array();
        foreach ($this->tokens() as $word) {
            if ($this->length($word) >= $this->wordLengthMin
                && !isset($common[$word])
                && !is_numeric($word)
            ) {
                $words[] = $word;
            }
        }

        return $this->rank($words, $this->wordOccuredMin);
    }

    /**
     * Builds the two-word phrase keywords.
     *
     * @return string phrases ordered by frequency, each followed by ";; "
     */
    public function parse_2words()
    {
        return $this->parse_phrases(
            2,
            $this->word2WordPhraseLengthMin,
            $this->phrase2WordLengthMin,
            $this->phrase2WordLengthMinOccur
        );
    }

    /**
     * Builds the three-word phrase keywords.
     *
     * @return string phrases ordered by frequency, each followed by ";; "
     */
    public function parse_3words()
    {
        return $this->parse_phrases(
            3,
            $this->word3WordPhraseLengthMin,
            $this->phrase3WordLengthMin,
            $this->phrase3WordLengthMinOccur
        );
    }

    /**
     * Keeps only the entries that occurred at least $min_occur times.
     *
     * @param array $array_count_values map of keyword => occurrence count
     * @param int   $min_occur          minimum number of occurrences
     * @return array filtered map, original order preserved
     */
    public function occure_filter($array_count_values, $min_occur)
    {
        $occur_filtered = array();
        foreach ($array_count_values as $word => $occured) {
            if ($occured >= $min_occur) {
                $occur_filtered[$word] = $occured;
            }
        }

        return $occur_filtered;
    }

    /**
     * Joins the KEYS of $array, appending $gule after every key (so the
     * result ends with the glue unless the array is empty).
     *
     * @param string $gule  glue appended after each key
     * @param array  $array map whose keys are joined
     * @return string
     */
    public function implode($gule, $array)
    {
        $c = "";
        foreach ($array as $key => $val) {
            $c .= $key . $gule;
        }
        return $c;
    }

    /**
     * Reads one entry of the parameter array without raising a notice.
     *
     * @param mixed  $params parameter array passed to the constructor
     * @param string $key    entry to read
     * @return mixed the entry, or null when it is absent
     */
    private static function param($params, $key)
    {
        return isset($params[$key]) ? $params[$key] : null;
    }

    /**
     * Returns the common-word list as a word => index map for O(1) lookups.
     *
     * @return array
     */
    private static function common_word_lookup()
    {
        if (self::$commonWordLookup === null) {
            self::$commonWordLookup = array_flip(self::$commonWords);
        }
        return self::$commonWordLookup;
    }

    /**
     * Splits the normalised contents on single spaces and trims every token.
     *
     * @return array list of tokens (may contain empty strings)
     */
    private function tokens()
    {
        return array_map('trim', explode(' ', (string) $this->contents));
    }

    /**
     * Character length of $text in the configured encoding.
     *
     * @param string $text
     * @return int
     */
    private function length($text)
    {
        return $this->encoding ? mb_strlen($text, $this->encoding) : mb_strlen($text);
    }

    /**
     * Counts the items, drops the rare ones, sorts by frequency (highest
     * first) and joins them, each followed by the separator.
     *
     * @param array $items    list of words or phrases
     * @param int   $occurMin minimum number of occurrences
     * @return string
     */
    private function rank($items, $occurMin)
    {
        $ranked = $this->occure_filter(array_count_values($items), $occurMin);
        arsort($ranked);

        return $this->implode(self::SEPARATOR, $ranked);
    }

    /**
     * Builds phrases of $size consecutive words.
     *
     * A phrase qualifies when every word is at least $wordLengthMin characters
     * long and the words together (spaces excluded) are at least
     * $phraseLengthMin characters long. Both limits are inclusive, matching
     * the "minimum" semantics of the single-word check.
     *
     * @param int $size            number of words per phrase
     * @param int $wordLengthMin   minimum length of each word
     * @param int $phraseLengthMin minimum combined length of the words
     * @param int $occurMin        minimum number of occurrences
     * @return string phrases ordered by frequency, each followed by ";; "
     */
    private function parse_phrases($size, $wordLengthMin, $phraseLengthMin, $occurMin)
    {
        $tokens = $this->tokens();
        $phrases = array();
        $lastStart = count($tokens) - $size;

        for ($start = 0; $start <= $lastStart; $start++) {
            $words = array_slice($tokens, $start, $size);
            $combinedLength = 0;
            $accepted = true;
            foreach ($words as $word) {
                $wordLength = $this->length($word);
                if ($wordLength < $wordLengthMin) {
                    $accepted = false;
                    break;
                }
                $combinedLength += $wordLength;
            }
            if ($accepted && $combinedLength >= $phraseLengthMin) {
                // Unqualified call: this is PHP's global implode(), not the
                // method of the same name defined on this class.
                $phrases[] = implode(' ', $words);
            }
        }

        return $this->rank($phrases, $occurMin);
    }
}
- ?>