/wp-content/plugins/wordpress-seo/admin/linkdex/TextStatistics.php
PHP | 369 lines | 200 code | 33 blank | 136 comment | 11 complexity | 539e14e24292a33f439c160adf548d2e MD5 | raw file
- <?php
- /*
- TextStatistics Class
- http://code.google.com/p/php-text-statistics/
- Released under New BSD license
- http://www.opensource.org/licenses/bsd-license.php
- Calculates following readability scores (formulae can be found in wiki):
- * Flesch Kincaid Reading Ease
- * Flesch Kincaid Grade Level
- * Gunning Fog Score
- * Coleman Liau Index
- * SMOG Index
- * Automated Readability Index
- Will also give:
- * String length
- * Letter count
- * Syllable count
- * Sentence count
- * Average words per sentence
- * Average syllables per word
-
- Sample Code
- ----------------
- $statistics = new TextStatistics;
- $text = 'The quick brown fox jumped over the lazy dog.';
- echo 'Flesch-Kincaid Reading Ease: ' . $statistics->flesch_kincaid_reading_ease($text);
- Modifications by Yoast
- -----
- Removed all multibyte code references for speed and compatibility
- */
/**
 * Computes readability scores and basic statistics for English text.
 *
 * All public scoring methods accept raw text (HTML allowed), normalise it with
 * clean_text() and return a score rounded to one decimal place. The class is
 * byte-oriented: multibyte helpers were removed on purpose, so only ASCII
 * letters are counted as letters.
 */
class TextStatistics {

    /**
     * Character encoding handed to the constructor ('' when none was given).
     *
     * @var string
     */
    protected $strEncoding = '';

    /**
     * Constructor.
     *
     * @param string $strEncoding Optional character encoding.
     */
    public function __construct($strEncoding = '') {
        if ($strEncoding != '') {
            // Encoding is given. Remember it.
            $this->strEncoding = $strEncoding;
        }
    }

    /**
     * Gives the Flesch-Kincaid Reading Ease of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float Reading ease score.
     */
    public function flesch_kincaid_reading_ease($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);

        return round(206.835 - (1.015 * $fltWordsPerSentence) - (84.6 * $fltSyllablesPerWord), 1);
    }

    /**
     * Gives the Flesch-Kincaid Grade Level of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float Grade level.
     */
    public function flesch_kincaid_grade_level($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltSyllablesPerWord = $this->average_syllables_per_word($strText);

        return round((0.39 * $fltWordsPerSentence) + (11.8 * $fltSyllablesPerWord) - 15.59, 1);
    }

    /**
     * Gives the Gunning-Fog score of the text, rounded to one digit.
     *
     * Words that start with an upper-case character are not counted as complex
     * words (clean_text() lower-cases the first word of each sentence so that
     * ordinary sentence openers are still counted).
     *
     * @param string $strText Text to be checked.
     * @return float Gunning-Fog score.
     */
    public function gunning_fog_score($strText) {
        $strText = $this->clean_text($strText);
        $fltWordsPerSentence = $this->average_words_per_sentence($strText);
        $fltComplexPercentage = $this->percentage_words_with_three_syllables($strText, false);

        return round(($fltWordsPerSentence + $fltComplexPercentage) * 0.4, 1);
    }

    /**
     * Gives the Coleman-Liau Index of the text, rounded to one digit.
     *
     * Published formula: 0.0588 * L - 0.296 * S - 15.8, where L is letters per
     * 100 words and S is sentences per 100 words. Expressed per word that is
     * 5.88 * letters/words - 29.6 * sentences/words - 15.8.
     *
     * @param string $strText Text to be checked.
     * @return float Coleman-Liau index.
     */
    public function coleman_liau_index($strText) {
        $strText = $this->clean_text($strText);
        $intWordCount = $this->word_count($strText); // Always >= 1, safe divisor
        $fltLettersPerWord = $this->letter_count($strText) / $intWordCount;
        $fltSentencesPerWord = $this->sentence_count($strText) / $intWordCount;

        return round((5.88 * $fltLettersPerWord) - (29.6 * $fltSentencesPerWord) - 15.8, 1);
    }

    /**
     * Gives the SMOG Index of the text, rounded to one digit.
     *
     * Published formula: 1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291.
     * The constant is added after taking the square root.
     *
     * @param string $strText Text to be checked.
     * @return float SMOG index.
     */
    public function smog_index($strText) {
        $strText = $this->clean_text($strText);
        $intLongWords = $this->words_with_three_syllables($strText);
        $intSentenceCount = $this->sentence_count($strText); // Always >= 1, safe divisor

        return round((1.043 * sqrt($intLongWords * (30 / $intSentenceCount))) + 3.1291, 1);
    }

    /**
     * Gives the Automated Readability Index of the text, rounded to one digit.
     *
     * @param string $strText Text to be checked.
     * @return float Automated Readability Index.
     */
    public function automated_readability_index($strText) {
        $strText = $this->clean_text($strText);
        $intWordCount = $this->word_count($strText);
        $fltLettersPerWord = $this->letter_count($strText) / $intWordCount;
        $fltWordsPerSentence = $intWordCount / $this->sentence_count($strText);

        return round((4.71 * $fltLettersPerWord) + (0.5 * $fltWordsPerSentence) - 21.43, 1);
    }

    /**
     * Gives the length of a UTF-8 string in characters.
     *
     * Every UTF-8 character consists of exactly one lead byte followed by zero
     * or more continuation bytes (0x80-0xBF), so dropping the continuation
     * bytes leaves one byte per character. This avoids utf8_decode(), which is
     * deprecated as of PHP 8.2, without depending on the mbstring extension.
     *
     * @param string $strText Text to be measured.
     * @return int Number of characters.
     */
    public function text_length($strText) {
        return strlen(preg_replace('/[\x80-\xBF]+/', '', (string) $strText));
    }

    /**
     * Gives the letter count (ignores everything that is not A-Z or a-z).
     *
     * @param string $strText Text to be measured.
     * @return int Number of ASCII letters.
     */
    public function letter_count($strText) {
        $strText = $this->clean_text($strText); // To clear out markup, newlines etc
        $strLettersOnly = preg_replace('/[^A-Za-z]+/', '', $strText);

        return strlen($strLettersOnly);
    }

    /**
     * Trims, strips markup, removes line breaks and multiple spaces, and
     * normalises sentence terminators before processing.
     *
     * The result always ends with a single full stop, every terminator is a
     * full stop followed by one space, and the word directly after a
     * terminator is lower-cased.
     *
     * @param string $strText Text to be transformed.
     * @return string Cleaned text.
     */
    protected function clean_text($strText) {
        // Closing tags of block-level elements mark the end of a sentence, so
        // they are turned into full stops before the markup is stripped.
        $fullStopTags = array('li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd');
        foreach ($fullStopTags as $tag) {
            $strText = str_ireplace('</' . $tag . '>', '.', $strText);
        }
        $strText = strip_tags($strText);
        $strText = preg_replace('/[,:;()-]/', ' ', $strText); // Replace commas, hyphens etc (count them as spaces)
        $strText = preg_replace('/[\.!?]/', '.', $strText); // Unify terminators
        $strText = trim($strText) . '.'; // Add final terminator, just in case it's missing.
        $strText = preg_replace('/[ ]*(\n|\r\n|\r)[ ]*/', ' ', $strText); // Replace new lines with spaces
        $strText = preg_replace('/([\.])[\. ]+/', '$1', $strText); // Check for duplicated terminators
        $strText = trim(preg_replace('/[ ]*([\.])/', '$1 ', $strText)); // Pad sentence terminators
        $strText = preg_replace('/[ ]+/', ' ', $strText); // Remove multiple spaces

        // Lower case all words following terminators (for gunning fog score).
        // A closure is used because create_function() was removed in PHP 8.0.
        $strText = preg_replace_callback(
            '/\. [^ ]+/',
            function ($arrMatches) {
                return strtolower($arrMatches[0]);
            },
            $strText
        );

        return $strText;
    }

    /**
     * Converts a string to lower case using the byte-oriented strtolower().
     *
     * @param string $strText Text to be transformed.
     * @return string Lower-cased text.
     */
    protected function lower_case($strText) {
        return strtolower($strText);
    }

    /**
     * Converts a string to upper case using the byte-oriented strtoupper().
     *
     * @param string $strText Text to be transformed.
     * @return string Upper-cased text.
     */
    protected function upper_case($strText) {
        return strtoupper($strText);
    }

    /**
     * Gets a portion of a string using the byte-oriented substr().
     *
     * @param string $strText   Text to be cut up.
     * @param int    $intStart  Start offset.
     * @param int    $intLength Length.
     * @return string Requested portion of the text.
     */
    protected function substring($strText, $intStart, $intLength) {
        return substr($strText, $intStart, $intLength);
    }

    /**
     * Returns the sentence count for the text (never less than 1).
     *
     * @param string $strText Text to be measured.
     * @return int Number of sentences.
     */
    public function sentence_count($strText) {
        $strText = $this->clean_text($strText);
        // Will be tripped up by "Mr." or "U.K.". Not a major concern at this point.
        $strTerminators = preg_replace('/[^\.!?]/', '', $strText);

        return max(1, $this->text_length($strTerminators));
    }

    /**
     * Returns the word count for the text (never less than 1).
     *
     * @param string $strText Text to be measured.
     * @return int Number of words.
     */
    public function word_count($strText) {
        $strText = $this->clean_text($strText);
        // Will be tripped up by em dashes with spaces either side, among other similar characters
        $strSpaces = preg_replace('/[^ ]/', '', $strText);

        return 1 + $this->text_length($strSpaces); // Space count + 1 is word count
    }

    /**
     * Returns the average number of words per sentence.
     *
     * @param string $strText Text to be measured.
     * @return float|int Words divided by sentences.
     */
    public function average_words_per_sentence($strText) {
        $strText = $this->clean_text($strText);

        return $this->word_count($strText) / $this->sentence_count($strText);
    }

    /**
     * Returns the average number of syllables per word.
     *
     * @param string $strText Text to be measured.
     * @return float|int Syllables divided by words.
     */
    public function average_syllables_per_word($strText) {
        $strText = $this->clean_text($strText);
        $intWordCount = $this->word_count($strText);
        $intSyllableCount = 0;
        foreach (explode(' ', $strText) as $strWord) {
            $intSyllableCount += $this->syllable_count($strWord);
        }

        return $intSyllableCount / $intWordCount;
    }

    /**
     * Returns the number of words with three or more syllables.
     *
     * @param string $strText             Text to be measured.
     * @param bool   $blnCountProperNouns Whether words starting with an upper-case
     *                                    character (treated as proper nouns) are counted.
     * @return int Number of long words.
     */
    public function words_with_three_syllables($strText, $blnCountProperNouns = true) {
        $strText = $this->clean_text($strText);
        $intLongWordCount = 0;
        foreach (explode(' ', $strText) as $strWord) {
            if ($this->syllable_count($strWord) <= 2) {
                continue;
            }
            if ($blnCountProperNouns) {
                $intLongWordCount++;
                continue;
            }
            // Proper nouns are excluded: only count the word when its first
            // character changes under upper-casing, i.e. it is lower case.
            $strFirstLetter = $this->substring($strWord, 0, 1);
            if ($strFirstLetter !== $this->upper_case($strFirstLetter)) {
                $intLongWordCount++;
            }
        }

        return $intLongWordCount;
    }

    /**
     * Returns the percentage of words with three or more syllables.
     *
     * @param string $strText             Text to be measured.
     * @param bool   $blnCountProperNouns Whether words starting with an upper-case
     *                                    character (treated as proper nouns) are counted.
     * @return float|int Percentage between 0 and 100.
     */
    public function percentage_words_with_three_syllables($strText, $blnCountProperNouns = true) {
        $strText = $this->clean_text($strText);
        $intWordCount = $this->word_count($strText);
        $intLongWordCount = $this->words_with_three_syllables($strText, $blnCountProperNouns);

        return ($intLongWordCount / $intWordCount) * 100;
    }

    /**
     * Returns the estimated number of syllables in a word (never less than 1).
     * Based in part on Greg Fast's Perl module Lingua::EN::Syllables.
     *
     * @param string $strWord Word to be measured.
     * @return int Number of syllables.
     */
    public function syllable_count($strWord) {
        // Normalise FIRST: drop everything that is not a letter, then lower
        // case. Words handed over by clean_text() can carry a trailing full
        // stop ("forever."); unless it is removed up front, the exception table
        // and the $-anchored suffix patterns below never match.
        $strWord = $this->lower_case(preg_replace('/[^A-Za-z]/', '', (string) $strWord));

        // Specific common exceptions that don't follow the rule set below are handled individually
        // Array of problem words (with word as key, syllable count as value)
        $arrProblemWords = array(
            'simile' => 3,
            'forever' => 3,
            'shoreline' => 2,
        );
        if (isset($arrProblemWords[$strWord])) {
            return $arrProblemWords[$strWord];
        }

        // These syllables would be counted as two but should be one
        $arrSubSyllables = array(
            'cial',
            'tia',
            'cius',
            'cious',
            'giu',
            'ion',
            'iou',
            'sia$',
            '[^aeiuoyt]{2,}ed$',
            '.ely$',
            '[cg]h?e[rsd]?$',
            'rved?$',
            '[aeiouy][dt]es?$',
            '[aeiouy][^aeiouydt]e[rsd]?$',
            '^[dr]e[aeiou][^aeiou]+$', // Sorts out deal, deign etc
            '[aeiouy]rse$', // Purse, hearse
        );

        // These syllables would be counted as one but should be two
        $arrAddSyllables = array(
            'ia',
            'riet',
            'dien',
            'iu',
            'io',
            'ii',
            '[aeiouym]bl$',
            '[aeiou]{3}',
            '^mc',
            'ism$',
            '([^aeiouy])\1l$',
            '[^l]lien',
            '^coa[dglx].',
            '[^gq]ua[^auieo]',
            'dnt$',
            'uity$',
            'ie(r|st)$',
        );

        // Single syllable prefixes and suffixes
        $arrPrefixSuffix = array(
            '/^un/',
            '/^fore/',
            '/ly$/',
            '/less$/',
            '/ful$/',
            '/ers?$/',
            '/ings?$/',
        );

        // Remove prefixes and suffixes and count how many were taken; each
        // one removed contributes exactly one syllable.
        $intPrefixSuffixCount = 0;
        $strWord = preg_replace($arrPrefixSuffix, '', $strWord, -1, $intPrefixSuffixCount);

        // Every run of vowels in what is left counts as one syllable.
        $intWordPartCount = 0;
        foreach (preg_split('/[^aeiouy]+/', $strWord) as $strWordPart) {
            if ($strWordPart != '') {
                $intWordPartCount++;
            }
        }

        // Some syllables do not follow normal rules - check for them
        // Thanks to Joe Kovar for correcting a bug in the following lines
        $intSyllableCount = $intWordPartCount + $intPrefixSuffixCount;
        foreach ($arrSubSyllables as $strSyllable) {
            $intSyllableCount -= preg_match('~' . $strSyllable . '~', $strWord);
        }
        foreach ($arrAddSyllables as $strSyllable) {
            $intSyllableCount += preg_match('~' . $strSyllable . '~', $strWord);
        }

        // A word never has zero syllables.
        return ($intSyllableCount == 0) ? 1 : $intSyllableCount;
    }
}
- ?>