url; } function get_data($data_id) { $this->redirects = 0; if ($this->fetch($data_id)) { if ($this->code != HTTP_OK) { $_server_response = $this->headers; $_http_error = $this->code; $_url = htmlspecialchars($data_id); ob_start(); include('templates/error._http.tpl'); $this->error_message .= ob_get_contents(); ob_end_clean(); error_log("Cannot open $data_id, HTTP result code is: ".$this->code); return null; }; return new FetchedDataURL($this->content, explode("\r\n",$this->headers), $this->url); } elseif ($this->redirects > MAX_REDIRECTS) { $_server_response = $this->headers; $_url = htmlspecialchars($data_id); ob_start(); include('templates/error._redirects.tpl'); $this->error_message .= ob_get_contents(); ob_end_clean(); error_log(sprintf("Cannot open %s, too many redirects", $data_id)); return null; } else { $_server_response = $this->headers; $_url = htmlspecialchars($data_id); ob_start(); include('templates/error._connection.tpl'); $this->error_message .= ob_get_contents(); ob_end_clean(); error_log(sprintf("Cannot open %s", $data_id)); return null; } } function error_message() { return $this->error_message; } // FetcherURL - constructor function FetcherURL() { $this->_connections = array(); $this->error_message = ""; $this->redirects = 0; $this->port = 80; // Default encoding // $this->encoding = "iso-8859-1"; $this->user_agent = DEFAULT_USER_AGENT; } // --------------------------------------------- // FetcherURL - PRIVATE methods // --------------------------------------------- /** * Connects to the target host using either HTTP or HTTPS protocol; * returns handle to connection socked or 'null' in case connection failed. * * @access private * @final * @return resource */ function _connect() { // Connect to the target host if ($this->protocol == "https") { return $this->_connect_ssl(); }; $fp = @fsockopen($this->host,$this->port,$errno,$errstr,HTML2PS_CONNECTION_TIMEOUT); if (!$fp) { $message = sprintf("Cannot connect to %s:%d - (%d) %s", $this->host, $this->port, $errno, $errstr); error_log($message); $this->error_message = $message; return null; }; return $fp; } function _connect_ssl() { /** * Check if there's SSL support library loaded * * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box) * openssl extension IS present, but fsockopen still complains "No SSL support in this build". * (probably PHP bug?) */ if (!extension_loaded('openssl')) { $message = sprintf("Cannot connect to %s:%d. SSL Extension missing", $this->host, $this->port); error_log($message); $this->error_message .= $message; return null; }; $fp = @fsockopen("ssl://$this->host", $this->port, $errno, $errstr, 5); if (!$fp) { $message = sprintf("Cannot connect to %s:%d - (%d) %s
Missing SSL support?", $this->host, $this->port, $errno, $errstr); error_log($message); $this->error_message = $message; return null; }; return $fp; } function _extract_code($res) { // Check return code // Note the return code will always be contained in the response, so // the we may not check the result of 'preg_match' - it matches always. // // A month later: nope, not always. // if (preg_match('/\s(\d+)\s/',$res,$matches)) { $result = $matches[1]; } else { $result = "200"; }; return $result; } function _fix_location($location) { if (substr($location, 0, 7) == "http://") { return $location; }; if (substr($location, 0, 8) == "https://") { return $location; }; if ($location{0} == "/") { return $this->protocol."://".$this->host.$location; }; return $this->protocol."://".$this->host.$this->path.$location; } function fetch($url) { /** * Handle empty $url value; unfortunaltely, parse_url will treat empty value as valid * URL, so fetcher will attempt to fetch something from the localhost instead of * passing control to subsequent user-defined fetchers (which probably will know * how to handle this). */ if ($url === "") { return null; } $this->url = $url; $parts = @parse_url($this->url); /** * If an malformed URL have been specified, add a message to the log file and * continue processing (as such URLs may be found in otherwise good HTML file - * for example, invalid image or CSS reference) */ if ($parts == false) { error_log(sprintf("The URL '%s' could not be parsed", $this->url)); $this->content = ''; $this->code = HTTP_OK; return true; }; /** * Setup default values */ $this->protocol = 'http'; $this->host = 'localhost'; $this->user = ""; $this->pass = ""; $this->port = 80; $this->path = "/"; $this->query = ""; if (isset($parts['scheme'])) { $this->protocol = $parts['scheme']; }; if (isset($parts['host'])) { $this->host = $parts['host']; }; if (isset($parts['user'])) { $this->user = $parts['user']; }; if (isset($parts['pass'])) { $this->pass = $parts['pass']; }; if (isset($parts['port'])) { $this->port = $parts['port']; }; if (isset($parts['path'])) { $this->path = $parts['path']; } else { $this->path = "/"; }; if (isset($parts['query'])) { $this->path .= '?'.$parts['query']; }; switch (strtolower($this->protocol)) { case 'http': return $this->fetch_http(); case 'https': return $this->fetch_https(); case 'file': $this->host = ""; return $this->fetch_file(); default: $message = sprintf("Unsupported protocol: %s", $this->protocol); error_log($message); $this->error_message .= $message; return null; } } function fetch_http() { $res = $this->_head(); if (is_null($res)) { return null; }; $this->code = $this->_extract_code($res); return $this->_process_code($res); } function fetch_https() { /** * SSL works via port 443 */ if ($this->protocol == "https" && !isset($parts['port'])) { $this->port = 443; } $res = $this->_head(); if (is_null($res)) { return null; }; $this->code = $this->_extract_code($res); return $this->_process_code($res); } function fetch_file() { if (PHP_OS == "WINNT") { $path = substr($this->url, 7); if ($path{0} == "/") { $path = substr($path, 1); }; } else { $path = substr($this->url, 7); }; $normalized_path = realpath(urldecode($path)); $normalized_path_part = substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT)); if ($normalized_path_part !== FILE_PROTOCOL_RESTRICT) { error_log(sprintf("Access denied to file '%s'", $normalized_path)); $this->content = ""; $this->code = HTTP_OK; return true; } $this->content = @file_get_contents($normalized_path); $this->code = HTTP_OK; return true; } function _get() { $socket = $this->_connect(); if (is_null($socket)) { return null; }; // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents) $header = "GET ".$this->path." HTTP/1.1\r\n"; $header .= "Host: ".$this->host."\r\n"; $header .= "Accept: */*\r\n"; $header .= "User-Agent: ".$this->user_agent."\r\n"; $header .= "Connection: keep-alive\r\n"; $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"; $header .= $this->_header_basic_authorization(); $header .= "\r\n"; fputs ($socket, $header); // Get the responce $res = ""; // The PHP-recommended construction // while (!feof($fp)) { $res .= fread($fp, 4096); }; // hangs indefinitely on www.searchscout.com, for example. // seems that they do not close conection on their side or somewhat similar; // let's assume that there will be no HTML pages greater than 1 Mb $res = fread($socket, 1024*1024); // Close connection handle, we do not need it anymore fclose($socket); return $res; } function _head() { $socket = $this->_connect(); if (is_null($socket)) { return null; }; // Build the HEAD request header (we're saying we're just a browser as some pages don't like non-standard user-agents) $header = "HEAD ".$this->path." HTTP/1.1\r\n"; $header .= "Host: ".$this->host."\r\n"; $header .= "Accept: */*\r\n"; $header .= "User-Agent: ".$this->user_agent."\r\n"; $header .= "Connection: keep-alive\r\n"; $header .= "Accept: text/html\r\n"; $header .= "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"; $header .= $this->_header_basic_authorization(); $header .= "\r\n"; // Send the header fputs ($socket, $header); // Get the responce $res = ""; // The PHP-recommended construction // while (!feof($fp)) { $res .= fread($fp, 4096); }; // hangs indefinitely on www.searchscout.com, for example. // seems that they do not close conection on their side or somewhat similar; // let's assume that there will be no HTML pages greater than 1 Mb $res = fread($socket, 4096); // Close connection handle, we do not need it anymore fclose($socket); return $res; } function _process_code($res, $used_get = false) { switch ($this->code) { case '200': // OK if (preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) { $this->headers = $matches[1]; }; /** * @todo add error processing here * * Note: file_get_contents is smart enough to use basic authorization headers provided * user name / password are given in the URL. */ $this->content = @file_get_contents($this->url); return true; break; case '301': // Moved Permanently $this->redirects++; if ($this->redirects > MAX_REDIRECTS) { return false; }; preg_match('/Location: ([\S]+)/i',$res,$matches); return $this->fetch($this->_fix_location($matches[1])); case '302': // Found $this->redirects++; if ($this->redirects > MAX_REDIRECTS) { return false; }; preg_match('/Location: ([\S]+)/i',$res,$matches); error_log('Redirected to:'.$matches[1]); return $this->fetch($this->_fix_location($matches[1])); case '400': // Bad request case '401': // Unauthorized case '402': // Payment required case '403': // Forbidden case '404': // Not found - but should return some html content - error page case '406': // Not acceptable if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) { error_log("Unrecognized HTTP response"); return false; }; $this->headers = $matches[1]; $this->content = @file_get_contents($this->url); return true; case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests // Try to get URL information using GET request (if we didn't tried it before) if (!$used_get) { $res = $this->_get(); if (is_null($res)) { return null; }; $this->code = $this->_extract_code($res); return $this->_process_code($res, true); } else { if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) { error_log("Unrecognized HTTP response"); return false; }; $this->headers = $matches[1]; $this->content = @file_get_contents($this->url); return true; }; default: error_log("Unrecognized HTTP result code:".$this->code); return false; }; } function _header_basic_authorization() { if (!is_null($this->user) && $this->user != "") { return sprintf("Authorization: Basic %s\r\n", base64_encode($this->user.":".$this->pass)); }; } } ?>