TextStatistics.php | searchcode

/html/blog/wp-content/plugins/wordpress-seo/admin/TextStatistics.php

https://github.com/jimmytidey/jimmytidey.co.uk
PHP | 414 lines | 237 code | 49 blank | 128 comment | 33 complexity | 9c9abfd71650533c7a08bcfdc7af15f2 MD5 | raw file

<?php
/**
 * @package Admin
 */

// Refuse to run when the WPSEO_VERSION constant is not defined: answer with a
// 403 (sent both as a CGI-style "Status" header and as an HTTP status line)
// and stop executing.
if ( false === defined( 'WPSEO_VERSION' ) ) {
	foreach ( array( 'Status: 403 Forbidden', 'HTTP/1.1 403 Forbidden' ) as $wpseo_forbidden_header ) {
		header( $wpseo_forbidden_header );
	}
	exit();
}

if ( ! class_exists( 'Yoast_TextStatistics' ) ) {
	/**
	 * Modified (reduced) TextStatistics class.
	 *
	 * Functionality that isn't needed within the WordPress SEO plugin has mostly been removed.
	 * What remains computes the Flesch reading ease score and the letter, word, sentence and
	 * syllable counts that score is built on. The heuristics only recognise A-Z letters and
	 * English syllable patterns.
	 *
	 * @link    http://code.google.com/p/php-text-statistics/
	 * @link    https://github.com/DaveChild/Text-Statistics (new repo location)
	 * @license http://www.opensource.org/licenses/bsd-license.php New BSD license
	 *
	 * @todo [JRF => whomever] Research if a class/library can be found which will offer
	 * this functionality to a broader scope of languages/charsets.
	 * Now basically limited to English.
	 */
	class Yoast_TextStatistics {

		/**
		 * @var string $strEncoding Character encoding passed to the mb_* functions; '' means "use their default".
		 */
		protected $strEncoding = '';

		/**
		 * @var bool $blnMbstring Efficiency: is the mbstring extension loaded ? (Determined once in the constructor.)
		 */
		protected $blnMbstring = true;

		/**
		 * @var bool $normalize Should scores be clamped to their min/max by normalize_score() ?
		 */
		public $normalize = true;


		/**
		 * Constructor.
		 *
		 * @param string $strEncoding Optional character encoding. Ignored when empty or not a string.
		 */
		public function __construct( $strEncoding = '' ) {
			if ( is_string( $strEncoding ) && '' !== $strEncoding ) {
				// Encoding is given. Use it!
				$this->strEncoding = $strEncoding;
			}
			$this->blnMbstring = extension_loaded( 'mbstring' );
		}

		/**
		 * Gives the Flesch reading ease of the text, rounded to one digit.
		 *
		 * Score = 206.835 - ( 1.015 * average words per sentence ) - ( 84.6 * average syllables per word ).
		 * The arithmetic is delegated to wpseo_calc(), which is defined elsewhere in the plugin.
		 *
		 * @param  string $strText Text to be checked
		 * @return int|float Score, limited to the 0-100 range unless $this->normalize is false
		 */
		public function flesch_kincaid_reading_ease( $strText ) {
			$strText = $this->clean_text( $strText );

			$sentenceLengthPenalty = wpseo_calc( 1.015, '*', $this->average_words_per_sentence( $strText ) );
			$syllablePenalty       = wpseo_calc( 84.6, '*', $this->average_syllables_per_word( $strText ) );
			$score                 = wpseo_calc( wpseo_calc( 206.835, '-', $sentenceLengthPenalty ), '-', $syllablePenalty );

			return $this->normalize_score( $score, 0, 100 );
		}

		/**
		 * Gives string length. Tries mb_strlen and if that is unavailable or fails uses regular strlen.
		 *
		 * @param  string $strText Text to be measured
		 *
		 * @return int
		 */
		public function text_length( $strText ) {
			return $this->multibyte_with_fallback( 'mb_strlen', 'strlen', $strText );
		}

		/**
		 * Gives letter count (ignores everything that is not A-Z / a-z).
		 *
		 * @param string $strText Text to be measured
		 *
		 * @return int
		 */
		public function letter_count( $strText ) {
			$strText = $this->clean_text( $strText ); // To clear out newlines etc
			$strText = preg_replace( '`[^A-Za-z]+`', '', $strText );

			return $this->text_length( $strText );
		}

		/**
		 * Trims, removes tags, line breaks, multiple spaces and generally cleans text before processing.
		 *
		 * The result is a single line in which every sentence ends in '.', words are separated
		 * by exactly one space and number-only "words" are dropped. Results are memoised per
		 * input (keyed on its sha1) in a static array for the duration of the request.
		 *
		 * @param string $strText Text to be transformed
		 * @return string
		 */
		protected function clean_text( $strText ) {
			static $clean = array();

			$key = sha1( $strText );

			if ( isset( $clean[$key] ) ) {
				return $clean[$key];
			}

			// The closing tag of each of these elements is replaced by a full stop, so that the
			// element's content ends a sentence once the markup is stripped.
			$fullStopTags = array( 'li', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'dd' );
			$closingTags  = array();
			foreach ( $fullStopTags as $tag ) {
				$closingTags[] = '</' . $tag . '>';
			}
			$strText = str_ireplace( $closingTags, '.', $strText );

			$strText = strip_tags( $strText );
			$strText = preg_replace( '`[",:;\(\)-]`', ' ', $strText ); // Replace commas, hyphens etc (count them as spaces)
			$strText = preg_replace( '`[\.!?]`', '.', $strText ); // Unify terminators
			$strText = trim( $strText ) . '.'; // Add final terminator, just in case it's missing.
			$strText = preg_replace( '`[ ]*(\n|\r\n|\r)[ ]*`', ' ', $strText ); // Replace new lines with spaces
			$strText = preg_replace( '`([\.])[\. ]+`', '$1', $strText ); // Check for duplicated terminators
			$strText = trim( preg_replace( '`[ ]*([\.])`', '$1 ', $strText ) ); // Pad sentence terminators
			$strText = preg_replace( '` [0-9]+ `', ' ', ' ' . $strText . ' ' ); // Remove "words" comprised only of numbers
			$strText = preg_replace( '`[ ]+`', ' ', $strText ); // Remove multiple spaces

			// Lower case the first character following each terminator (the quantifier is lazy,
			// so only one character is matched). A method callback is used instead of
			// create_function(), which no longer exists as of PHP 8.0.
			$strText = preg_replace_callback( '`\. [^ ]+?`', array( $this, 'lowercase_matched_text' ), $strText );

			$strText = trim( $strText );

			// Cache it and return
			$clean[$key] = $strText;
			return $strText;
		}

		/**
		 * preg_replace_callback() helper for clean_text(): lower-cases the complete match.
		 *
		 * @param array $matches Match data as supplied by preg_replace_callback()
		 * @return string
		 */
		protected function lowercase_matched_text( $matches ) {
			return strtolower( $matches[0] );
		}

		/**
		 * Converts string to lower case. Tries mb_strtolower and if that is unavailable or fails uses regular strtolower.
		 *
		 * @param string $strText Text to be transformed
		 * @return string
		 */
		protected function lower_case( $strText ) {
			return $this->multibyte_with_fallback( 'mb_strtolower', 'strtolower', $strText );
		}

		/**
		 * Converts string to upper case. Tries mb_strtoupper and if that is unavailable or fails uses regular strtoupper.
		 *
		 * @param string $strText Text to be transformed
		 * @return string
		 */
		protected function upper_case( $strText ) {
			return $this->multibyte_with_fallback( 'mb_strtoupper', 'strtoupper', $strText );
		}

		/**
		 * Runs a multibyte string function on the text and falls back to its byte-wise
		 * counterpart when mbstring is not loaded or the multibyte call fails.
		 *
		 * "Fails" covers all ways the mb_* functions report a problem such as an unknown
		 * encoding: returning false (PHP < 8), throwing an Exception, or throwing an Error
		 * such as ValueError (PHP 8+).
		 *
		 * @param string $strMbFunction     Name of the mb_* function, called as f( $strText [, $encoding] )
		 * @param string $strNativeFunction Name of the fallback function, called as f( $strText )
		 * @param string $strText           Text to be processed
		 * @return mixed Whatever the function that ended up being used returns
		 */
		protected function multibyte_with_fallback( $strMbFunction, $strNativeFunction, $strText ) {
			if ( $this->blnMbstring ) {
				try {
					if ( '' === (string) $this->strEncoding ) {
						$result = call_user_func( $strMbFunction, $strText );
					} else {
						$result = call_user_func( $strMbFunction, $strText, $this->strEncoding );
					}

					if ( false !== $result ) {
						return $result;
					}
				} catch ( Exception $e ) {
					// Fall through to the native function below.
				} catch ( Error $e ) {
					// PHP 8+ reports invalid arguments this way. The class simply never
					// matches on PHP 5, where Error does not exist.
				}
			}

			return call_user_func( $strNativeFunction, $strText );
		}

		/**
		 * Returns sentence count for text.
		 *
		 * @param  string $strText Text to be measured
		 * @return int 0 for empty/whitespace-only text, otherwise at least 1
		 */
		public function sentence_count( $strText ) {
			if ( '' === trim( $strText ) ) {
				return 0;
			}

			$strText = $this->clean_text( $strText );
			// Will be tripped up by "Mr." or "U.K.". Not a major concern at this point.
			// [JRF] Will also be tripped up by ... or ?!
			// @todo [JRF => whomever] May be replace with something along the lines of this - will at least provide better count in ... and ?! situations:
			// $intSentences = max( 1, preg_match_all( '`[^\.!?]+[\.!?]+([\s]+|$)`u', $strText, $matches ) ); [/JRF]
			$strTerminators = preg_replace( '`[^\.!?]`', '', $strText ); // Keep only the terminators

			return max( 1, $this->text_length( $strTerminators ) );
		}

		/**
		 * Returns word count for text.
		 *
		 * @param  string $strText Text to be measured
		 * @return int 0 for empty/whitespace-only text, otherwise number of spaces in the cleaned text + 1
		 */
		public function word_count( $strText ) {
			if ( '' === trim( $strText ) ) {
				return 0;
			}

			$strText = $this->clean_text( $strText );
			// Will be tripped by em dashes with spaces either side, among other similar characters
			$strSpaces = preg_replace( '`[^ ]`', '', $strText ); // Keep only the spaces

			return 1 + $this->text_length( $strSpaces ); // Space count + 1 is word count
		}

		/**
		 * Returns average words per sentence for text.
		 *
		 * @param string $strText Text to be measured
		 * @return int|float Result of wpseo_calc( words, '/', sentences )
		 */
		public function average_words_per_sentence( $strText ) {
			$strText          = $this->clean_text( $strText );
			$intSentenceCount = $this->sentence_count( $strText );
			$intWordCount     = $this->word_count( $strText );

			return ( wpseo_calc( $intWordCount, '/', $intSentenceCount ) );
		}

		/**
		 * Returns average syllables per word for text.
		 *
		 * @param string $strText Text to be measured
		 * @return int|float Result of wpseo_calc( syllables, '/', words )
		 */
		public function average_syllables_per_word( $strText ) {
			$strText          = $this->clean_text( $strText );
			$intWordCount     = $this->word_count( $strText );
			$intSyllableCount = 0;

			// Cleaned text separates words by single spaces. Iterating the pieces themselves
			// (rather than indexing up to the word count) can never read an undefined offset.
			foreach ( explode( ' ', $strText ) as $strWord ) {
				$intSyllableCount += $this->syllable_count( $strWord );
			}

			return ( wpseo_calc( $intSyllableCount, '/', $intWordCount ) );
		}

		/**
		 * Returns the number of syllables in the word.
		 * Based in part on Greg Fast's Perl module Lingua::EN::Syllables
		 *
		 * The estimate is: number of vowel groups + number of stripped prefixes/suffixes,
		 * minus one per matching "sub" pattern, plus one per matching "add" pattern.
		 *
		 * @param string $strWord Word to be measured
		 * @return int 0 for an empty/whitespace-only word, otherwise at least 1
		 */
		public function syllable_count( $strWord ) {
			if ( '' === trim( $strWord ) ) {
				return 0;
			}

			// Should be no non-alpha characters
			$strWord = preg_replace( '`[^A-Za-z]`', '', $strWord );
			$strWord = $this->lower_case( $strWord );

			// Specific common exceptions that don't follow the rule set below are handled individually
			// Array of problem words (with word as key, syllable count as value)
			$arrProblemWords = array(
				'simile'    => 3,
				'forever'   => 3,
				'shoreline' => 2,
			);
			if ( isset( $arrProblemWords[$strWord] ) ) {
				return $arrProblemWords[$strWord];
			}

			// These syllables would be counted as two but should be one
			$arrSubSyllables = array(
				'cial',
				'tia',
				'cius',
				'cious',
				'giu',
				'ion',
				'iou',
				'sia$',
				'[^aeiuoyt]{2,}ed$',
				'.ely$',
				'[cg]h?e[rsd]?$',
				'rved?$',
				'[aeiouy][dt]es?$',
				'[aeiouy][^aeiouydt]e[rsd]?$',
				'^[dr]e[aeiou][^aeiou]+$', // Sorts out deal, deign etc
				'[aeiouy]rse$', // Purse, hearse
			);

			// These syllables would be counted as one but should be two
			$arrAddSyllables = array(
				'ia',
				'riet',
				'dien',
				'iu',
				'io',
				'ii',
				'[aeiouym]bl$',
				'[aeiou]{3}',
				'^mc',
				'ism$',
				'([^aeiouy])\1l$',
				'[^l]lien',
				'^coa[dglx].',
				'[^gq]ua[^auieo]',
				'dnt$',
				'uity$',
				'ie(r|st)$',
			);

			// Single syllable prefixes and suffixes
			$arrPrefixSuffix = array(
				'`^un`',
				'`^fore`',
				'`ly$`',
				'`less$`',
				'`ful$`',
				'`ers?$`',
				'`ings?$`',
			);

			// Remove prefixes and suffixes and count how many were taken
			$strWord = preg_replace( $arrPrefixSuffix, '', $strWord, -1, $intPrefixSuffixCount );

			// Every run of vowels between consonants counts as one syllable to start with
			$intWordPartCount = count( preg_split( '`[^aeiouy]+`', $strWord, -1, PREG_SPLIT_NO_EMPTY ) );

			// Some syllables do not follow normal rules - check for them
			// Thanks to Joe Kovar for correcting a bug in the following lines
			$intSyllableCount = $intWordPartCount + $intPrefixSuffixCount;
			foreach ( $arrSubSyllables as $strSyllable ) {
				$intSyllableCount -= preg_match( '`' . $strSyllable . '`', $strWord );
			}
			foreach ( $arrAddSyllables as $strSyllable ) {
				$intSyllableCount += preg_match( '`' . $strSyllable . '`', $strWord );
			}

			// A word always has at least one syllable. The subtractions above can push the
			// estimate to zero or below (several "sub" patterns may hit the same vowel group).
			return max( 1, $intSyllableCount );
		}

		/**
		 * Normalizes score according to min & max allowed. If score larger
		 * than max, max is returned. If score less than min, min is returned.
		 * Also rounds result to specified precision. When $this->normalize is
		 * false the rounded score is returned without limiting it.
		 * Thanks to github.com/lvil.
		 *
		 * @param	int|float  $score	Initial score
		 * @param	int 	   $min 	Minimum score allowed
		 * @param	int 	   $max 	Maximum score allowed
		 * @param	int 	   $dps 	Precision handed to wpseo_calc() for the rounding - presumably the number of decimals (wpseo_calc() is defined elsewhere; verify there)
		 * @return	int|float
		 */
		public function normalize_score( $score, $min, $max, $dps = 1 ) {
			$score = wpseo_calc( $score, '+', 0, true, $dps ); // Round
			if ( ! $this->normalize ) {
				return $score;
			}

			if ( $score > $max ) {
				$score = $max;
			} elseif ( $score < $min ) {
				$score = $min;
			}

			return $score;
		}

	} /* End of class */
} /* End of class-exists wrapper */