Current File : /home/jvzmxxx/wiki/extensions/Wikibase/lib/includes/StringNormalizer.php
<?php

namespace Wikibase;

use UtfNormal\Validator;

/**
 * StringNormalizer provides several methods for normalizing strings.
 *
 * Copyright © 03.07.13 by the authors listed below.
 *
 * @since 0.4
 *
 * @license GPL-2.0+
 * @author Daniel Kinzler
 * @author John Erling Blad < jeblad@gmail.com >
 */
class StringNormalizer {

	/**
	 * Remove bytes that represent an incomplete Unicode character
	 * at the end of string (e.g. bytes of the char are missing)
	 *
	 * @todo: this was stolen from the Language class. Make that code reusable.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	protected function removeBadCharLast( $string ) {
		if ( $string != '' ) {
			$char = ord( $string[strlen( $string ) - 1] );
			$m = array();
			if ( $char >= 0xc0 ) {
				# We got the first byte only of a multibyte char; remove it.
				$string = substr( $string, 0, -1 );
			} elseif ( $char >= 0x80 &&
				preg_match( '/^(.*)(?:[\xe0-\xef][\x80-\xbf]|' .
					'[\xf0-\xf7][\x80-\xbf]{1,2})$/', $string, $m )
			) {
				# We chopped in the middle of a character; remove it
				$string = $m[1];
			}
		}
		return $string;
	}

	/**
	 * Remove bytes that represent an incomplete Unicode character
	 * at the start of string (e.g. bytes of the char are missing)
	 *
	 * @todo: this was stolen from the Language class. Make that code reusable.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	protected function removeBadCharFirst( $string ) {
		if ( $string != '' ) {
			$char = ord( $string[0] );
			if ( $char >= 0x80 && $char < 0xc0 ) {
				# We chopped in the middle of a character; remove the whole thing
				$string = preg_replace( '/^[\x80-\xbf]+/', '', $string );
			}
		}
		return $string;
	}

	/**
	 * Remove incomplete UTF-8 sequences from the beginning and end of the string.
	 *
	 * @param string $string
	 *
	 * @return string
	 */
	public function trimBadChars( $string ) {
		$string = $this->removeBadCharFirst( $string );
		$string = $this->removeBadCharLast( $string );
		return $string;
	}

	/**
	 * Trim initial and trailing whitespace and control chars, and compress some internal control chars.
	 *
	 * @param string $inputString The actual string to process.
	 *
	 * @return string where whitespace possibly are removed.
	 */
	public function trimWhitespace( $inputString ) {
		$inputString = $this->trimBadChars( $inputString );

		// WARNING: *any* invalid UTF8 sequence causes preg_replace to return an empty string.
		// \p{Cc} only includes general control characters.
		$trimmed = preg_replace( '/\p{Cc}+/u', ' ', $inputString );
		// \p{Z} includes all whitespace characters and invisible separators.
		$trimmed = preg_replace( '/^\p{Z}+|\p{Z}+$/u', '', $trimmed );
		return $trimmed;
	}

	/**
	 * Normalize string into NFC by using the cleanup method from UtfNormal.
	 *
	 * @param string $inputString The actual string to process.
	 *
	 * @return string
	 */
	public function cleanupToNFC( $inputString ) {
		$cleaned = $inputString;
		$cleaned = $this->trimBadChars( $cleaned );
		$cleaned = Validator::cleanUp( $cleaned );
		return $cleaned;
	}

	/**
	 * Do a cleanupToNFC after the string is trimmed
	 *
	 * @param string $inputString
	 *
	 * @return string on NFC form
	 */
	public function trimToNFC( $inputString ) {
		return $this->cleanupToNFC( $this->trimWhitespace( $inputString ) );
	}

}