po = $po; }

    /**
     * Sum the simple word counts of one field across every row of the wrapped data.
     * @internal
     * @param string key of the field to read from each row, e.g. "source"
     * @return int total number of words found in that field
     */
    private function countField( $f ){
        $n = 0;
        foreach( $this->po as $r ){
            $n += self::simpleCount( $r[$f] );
        }
        return $n;
    }


    /**
     * Default count function returns source words (msgid) in current file.
     * The total is computed on first call and cached in $this->sw for later calls.
     * @return int
     */
    public function count(){
        $n = $this->sw;
        if( is_null($n) ){
            $n = $this->countField('source');
            $this->sw = $n;
        }
        return $n;
    }


    /**
     * Very simple word count, only suitable for latin characters, and biased toward English.
     * Markup is stripped and HTML entities are decoded before counting, so neither adds words.
     * @param string text to count words in; empty or null input counts as zero
     * @return int
     */
    public static function simpleCount( $str ){
        $n = 0;
        // Square-bracket offset: the curly-brace form ($str{0}) is a parse error from PHP 8.0.
        // isset() keeps empty strings and null out of the counting logic below.
        if( isset($str[0]) ){
            // TODO should we strip PHP string formatting?
            // e.g. "Hello %s" currently counts as 2 words.
            // $str = preg_replace('/%(?:\\d+\\$)?(?:\'.|[-+0 ])*\\d*(?:\\.\\d+)?[suxXbcdeEfFgGo%]/', '', $str );

            // Strip HTML (but only if open and close tags detected, else "< foo" would be stripped to nothing
            if( false !== strpos($str,'<') && false !== strpos($str,'>') ){
                $str = strip_tags($str);
            }

            // always html-decode, else escaped punctuation will be counted as words
            $str = html_entity_decode( $str, ENT_QUOTES, 'UTF-8');

            // Collapsing apostrophe'd words into single units:
            // Simplest way to handle ambiguity of "It's Tim's" (technically three words in English)
            $str = preg_replace('/(\\w+)\'(\\w)(\\W|$)/u', '\\1\\2\\3', $str );

            // Combining floating numbers into single units
            // e.g. "£1.50" and "€1,50" should be one word each
            $str = preg_replace('/\\d[\\d,\\.]+/', '0', $str );

            // count words by standard Unicode word boundaries
            $words = preg_split( '/\\W+/u', $str, -1, PREG_SPLIT_NO_EMPTY );
            $n += count($words);

            /*/ TODO should we exclude some words (like numbers)?
            foreach( $words as $word ){
                if( ! ctype_digit($word) ){
                    $n++;
                }
            }*/
        }
        return $n;
    }

}