// Percent encode anything invalid or not in ucschar
// Non-shortest form sequences are invalid
|| $length > 1 && $character <= 0x7F
|| $length > 2 && $character <= 0x7FF
|| $length > 3 && $character <= 0xFFFF
// Outside of range of ucschar codepoints
|| ($character & 0xFFFE) === 0xFFFE
|| $character >= 0xFDD0 && $character <= 0xFDEF
// Everything else not in ucschar
$character > 0xD7FF && $character < 0xF900
// Everything not in iprivate, if it applies
// If we were a character, pretend we weren't, but rather an error.
for ($j = $start; $j <= $position; $j++) {
$text = substr_replace($text, sprintf('%%%02X', ord($text[$j])), $j, 1);
* Callback function for preg_replace_callback.
* Removes sequences of percent encoded bytes that represent UTF-8
* encoded characters in iunreserved
* @param array $regex_match PCRE match
* @return string Replacement
protected function remove_iunreserved_percent_encoded($regex_match) {
// Walks the percent-encoded bytes of the match, assembles them into
// code points, and re-emits each sequence either as raw bytes or as
// uppercase %XX escapes (see the two $start..$i loops further down).
// NOTE(review): several statements of this body are not visible in this
// view (e.g. where $string, $remaining, $start and $length are assigned);
// confirm against the complete method before changing anything.
//
// As we just have valid percent encoded sequences we can just explode
// and ignore the first member of the returned array (an empty string).
$bytes = explode('%', $regex_match[0]);
// Initialize the new string (this is what will be returned) and record
// that there are no bytes remaining in the current sequence.
// Loop over each and every byte, and set $value to its value.
// Index 0 is skipped: it is the empty string before the first '%'.
for ($i = 1, $len = count($bytes); $i < $len; $i++) {
$value = hexdec($bytes[$i]);
// If we're the first byte of sequence:
// By default we are valid
// Lead byte 110xxxxx: keep its five payload bits, shifted into place.
elseif (($value & 0xE0) === 0xC0) {
$character = ($value & 0x1F) << 6;
// Lead byte 1110xxxx: keep its four payload bits, shifted into place.
elseif (($value & 0xF0) === 0xE0) {
$character = ($value & 0x0F) << 12;
// Lead byte 11110xxx: keep its three payload bits, shifted into place.
elseif (($value & 0xF8) === 0xF0) {
$character = ($value & 0x07) << 18;
// Check that the byte is valid (a 10xxxxxx continuation byte), then
// merge its six payload bits into the character at the offset selected
// by $remaining:
if (($value & 0xC0) === 0x80) {
$character |= ($value & 0x3F) << ($remaining * 6);
// If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
// If we've reached the end of the current byte sequence, append it to $string
// Percent encode anything invalid or not in iunreserved
// NOTE(review): the opening of this condition is not visible here; the
// lines below are its remaining alternatives.
// Non-shortest form sequences are invalid
|| $length > 1 && $character <= 0x7F
|| $length > 2 && $character <= 0x7FF
|| $length > 3 && $character <= 0xFFFF
// Outside of range of iunreserved codepoints
// (low 16 bits equal to 0xFFFE or 0xFFFF, and 0xFDD0..0xFDEF)
|| ($character & 0xFFFE) === 0xFFFE
|| $character >= 0xFDD0 && $character <= 0xFDEF
// Everything else not in iunreserved (this is all BMP):
// 0x3A..0x40, 0x5B..0x60, 0x7B..0x7D, 0x7F..0x9F and 0xD800..0xF8FF
|| $character > 0x39 && $character < 0x41
|| $character > 0x5A && $character < 0x61
|| $character > 0x7A && $character < 0x7E
|| $character > 0x7E && $character < 0xA0
|| $character > 0xD7FF && $character < 0xF900
// Keep the sequence escaped: re-emit bytes $start..$i as %XX with
// uppercase hex digits.
for ($j = $start; $j <= $i; $j++) {
$string .= '%' . strtoupper($bytes[$j]);
// Otherwise decode the sequence: emit bytes $start..$i as raw bytes.
for ($j = $start; $j <= $i; $j++) {
$string .= chr(hexdec($bytes[$j]));
// If we have any bytes left over they are invalid (i.e., we are
// mid-way through a multi-byte sequence), so they stay percent-encoded.
for ($j = $start; $j < $len; $j++) {
$string .= '%' . strtoupper($bytes[$j]);
protected function scheme_normalization() {
// Compares each component of this IRI with the value registered for the
// current scheme in $this->normalization[$this->scheme]. Each test only
// fires when such a value exists and is strictly identical to the
// component.
// NOTE(review): the branch bodies are not visible in this view;
// presumably a component equal to the scheme's default is reset. Confirm.
if (isset($this->normalization[$this->scheme]['iuserinfo']) && $this->iuserinfo === $this->normalization[$this->scheme]['iuserinfo']) {
if (isset($this->normalization[$this->scheme]['ihost']) && $this->ihost === $this->normalization[$this->scheme]['ihost']) {
if (isset($this->normalization[$this->scheme]['port']) && $this->port === $this->normalization[$this->scheme]['port']) {
if (isset($this->normalization[$this->scheme]['ipath']) && $this->ipath === $this->normalization[$this->scheme]['ipath']) {
// Special case: a host is set but the path is empty.
if (isset($this->ihost) && empty($this->ipath)) {
if (isset($this->normalization[$this->scheme]['iquery']) && $this->iquery === $this->normalization[$this->scheme]['iquery']) {
if (isset($this->normalization[$this->scheme]['ifragment']) && $this->ifragment === $this->normalization[$this->scheme]['ifragment']) {
* Check if the object represents a valid IRI. This needs to be done on each
* call as some things change depending on another part of the IRI.
public function is_valid() {
// An authority is considered present when any of userinfo, host or port
// is non-null.
$isauthority = $this->iuserinfo !== null || $this->ihost !== null || $this->port !== null;
// The condition below looks at a non-empty path and tests for:
//  - an authority combined with a path that does not start with '/';
//  - no scheme, and a ':' in the path that occurs before the first '/'
//    (or a ':' with no '/' at all).
// NOTE(review): grouping parentheses and the branch body are not visible
// in this view, so how these tests combine must be confirmed.
if ($this->ipath !== '' &&
$isauthority && $this->ipath[0] !== '/' ||
$this->scheme === null &&
strpos($this->ipath, ':') !== false &&
(strpos($this->ipath, '/') === false ? true : strpos($this->ipath, ':') < strpos($this->ipath, '/'))
public function __wakeup() {
// Type-checks the object's properties: every property named in
// $string_props must hold a string and every property named in
// $array_props must hold an array, otherwise an UnexpectedValueException
// is thrown.
// NOTE(review): anything done after these checks is not visible here.
// Property names declared on this class (values are the declared defaults).
$class_props = get_class_vars( __CLASS__ );
$string_props = array( 'scheme', 'iuserinfo', 'ihost', 'port', 'ipath', 'iquery', 'ifragment' );
$array_props = array( 'normalization' );
foreach ( $class_props as $prop => $default_value ) {
// Strict in_array() so that only exact property names match.
if ( in_array( $prop, $string_props, true ) && ! is_string( $this->$prop ) ) {
throw new UnexpectedValueException();
} elseif ( in_array( $prop, $array_props, true ) && ! is_array( $this->$prop ) ) {
throw new UnexpectedValueException();
* Set the entire IRI. Returns true on success, false on failure (if there
* are any invalid characters).
protected function set_iri($iri) {
// Parses $iri with parse_iri() and feeds each part to its setter.
// Results are remembered in $cache, keyed by the input string.
// NOTE(review): the declaration of $cache and the cache-hit branch body
// are not visible in this view.
if (isset($cache[$iri])) {
$parsed = $this->parse_iri($iri);
// The setters are chained with &&, so evaluation stops at the first one
// that returns a falsy value and $return is false in that case.
$return = $this->set_scheme($parsed['scheme'])
&& $this->set_authority($parsed['authority'])
&& $this->set_path($parsed['path'])
&& $this->set_query($parsed['query'])
&& $this->set_fragment($parsed['fragment']);
// Store the resulting component values for this input (the rest of the
// array literal is not visible here).
$cache[$iri] = array($this->scheme,
* Set the scheme. Returns true on success, false on failure (if there are
* any invalid characters).
protected function set_scheme($scheme) {
// NOTE(review): the first branch of this if/elseif chain and the bodies
// of its branches are not visible in this view.
// A scheme must start with an ASCII letter, followed only by ASCII
// letters, digits, '+', '-' or '.'.
elseif (!preg_match('/^[A-Za-z][0-9A-Za-z+\-.]*$/', $scheme)) {
// The scheme is stored in lowercase.
$this->scheme = strtolower($scheme);
* Set the authority. Returns true on success, false on failure (if there are
* any invalid characters).
* @param string $authority
protected function set_authority($authority) {
// Splits an authority string into userinfo, host and port, then passes
// the pieces to set_userinfo() and set_host() (the && chain continues
// past the lines visible here).
// NOTE(review): the initialisation of $cache and $remaining, and several
// branch bodies, are not visible in this view.
if ($authority === null) {
if (isset($cache[$authority])) {
// Unpack the cached entry (the start of this assignment is not visible).
$return) = $cache[$authority];
// Userinfo is everything before the last '@'; the rest stays in $remaining.
if (($iuserinfo_end = strrpos($remaining, '@')) !== false) {
$iuserinfo = substr($remaining, 0, $iuserinfo_end);
$remaining = substr($remaining, $iuserinfo_end + 1);
// Search for the port separator ':' starting at the first ']' (or at
// offset 0 when there is no ']'), so a ':' located before a ']' is not
// taken as the port separator.
if (($port_start = strpos($remaining, ':', (strpos($remaining, ']') ?: 0))) !== false) {
$port = substr($remaining, $port_start + 1);
// A ':' with nothing after it is special-cased (handling not visible here).
if ($port === false || $port === '') {
// Whatever precedes the ':' is left in $remaining and passed to set_host().
$remaining = substr($remaining, 0, $port_start);
$return = $this->set_userinfo($iuserinfo) &&
$this->set_host($remaining) &&
// Remember the resulting component values for this authority string (the
// rest of the array literal is not visible here).
$cache[$authority] = array($this->iuserinfo,
* @param string $iuserinfo
protected function set_userinfo($iuserinfo) {
// NOTE(review): the body of the null branch and the return statement
// are not visible in this view.
if ($iuserinfo === null) {
// Percent-encode everything replace_invalid_with_pct_encoding() treats as
// invalid; the second argument lists extra characters passed through as
// allowed: ! $ & ' ( ) * + , ; = :
$this->iuserinfo = $this->replace_invalid_with_pct_encoding($iuserinfo, '!$&\'()*+,;=:');
// Re-run scheme-based normalisation now that a component has changed.
$this->scheme_normalization();
* Set the ihost. Returns true on success, false on failure (if there are
* any invalid characters).
protected function set_host($ihost) {
// NOTE(review): several branches, the initialisation of $position and
// the return statements are not visible in this view.
// A host wrapped in '[' ... ']' is treated as an IPv6 literal: the inner
// text is validated with Ipv6::check_ipv6() and stored in compressed
// form, with the brackets restored.
if (substr($ihost, 0, 1) === '[' && substr($ihost, -1) === ']') {
if (Ipv6::check_ipv6(substr($ihost, 1, -1))) {
$this->ihost = '[' . Ipv6::compress(substr($ihost, 1, -1)) . ']';
// Any other host: percent-encode invalid characters; the second argument
// lists the extra characters passed through as allowed.
$ihost = $this->replace_invalid_with_pct_encoding($ihost, '!$&\'()*+,;=');
// Lowercase, but ignore pct-encoded sections (as they should
// remain uppercase). This must be done after the previous step
// as that can add unescaped characters.
$strlen = strlen($ihost);
// strcspn() advances $position to the next uppercase ASCII letter or '%';
// the loop ends once $position reaches the end of the string.
while (($position += strcspn($ihost, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ%', $position)) < $strlen) {
// A '%' gets separate handling (body not visible here; presumably the
// escape is skipped so it stays uppercase. Confirm).
if ($ihost[$position] === '%') {
// Lowercase the single byte at $position in place.
$ihost[$position] = strtolower($ihost[$position]);
// Re-run scheme-based normalisation now that a component has changed.
$this->scheme_normalization();
* Set the port. Returns true on success, false on failure (if there are
* any invalid characters).
protected function set_port($port) {
// NOTE(review): any earlier checks, the failure branch and the return
// statements are not visible in this view.
// Accept the port only when every character is an ASCII digit. An empty
// string also satisfies this test (0 === 0).
if (strspn($port, '0123456789') === strlen($port)) {
// Stored as an integer.
$this->port = (int) $port;
// Re-run scheme-based normalisation now that a component has changed.
$this->scheme_normalization();
protected function set_path($ipath) {
// Stores the path after percent-encoding invalid characters. When a
// scheme is set, the dot-segment-free variant is stored instead.
// NOTE(review): the declaration of $cache, the else structure and the
// return statement are not visible in this view.
$ipath = (string) $ipath;
if (isset($cache[$ipath])) {
// Cache entries are array($valid, $removed): index 0 is used when no
// scheme is set, index 1 (dot segments removed) when a scheme is set.
$this->ipath = $cache[$ipath][(int) ($this->scheme !== null)];
// Percent-encode invalid characters; the second argument lists the extra
// characters passed through as allowed.
$valid = $this->replace_invalid_with_pct_encoding($ipath, '!$&\'()*+,;=@:/');
$removed = $this->remove_dot_segments($valid);
// Keep both variants so a later call with the same input can pick either.
$cache[$ipath] = array($valid, $removed);
$this->ipath = ($this->scheme !== null) ? $removed : $valid;
// Re-run scheme-based normalisation now that a component has changed.
$this->scheme_normalization();
protected function set_query($iquery) {
$this->iquery = $this->replace_invalid_with_pct_encoding($iquery, '!$&\'()*+,;=:@/?', true);
$this->scheme_normalization();