url;
}
/**
 * Fetches the resource identified by $data_id.
 *
 * Resets the redirect counter, delegates to fetch() and, when the fetch
 * succeeded with an HTTP_OK code, wraps content, headers and final URL
 * into a FetchedDataURL object. In every other case the matching error
 * template is rendered into $this->error_message, a line is written to
 * the error log and null is returned.
 *
 * @param string $data_id URL of the resource to fetch
 * @return FetchedDataURL|null fetched data, or null on any failure
 */
function get_data($data_id) {
  $this->redirects = 0;

  $fetched = $this->fetch($data_id);
  if ($fetched && $this->code == HTTP_OK) {
    return new FetchedDataURL($this->content,
                              explode("\r\n", $this->headers),
                              $this->url);
  }

  // The $_server_response / $_http_error / $_url locals are not used by this
  // method itself; presumably they are read by the included template, which
  // shares this function's variable scope.
  $_server_response = $this->headers;
  if ($fetched) {
    // Request completed, but the server answered with a non-OK code
    $failure = 'http';
    $_http_error = $this->code;
    $template = 'templates/error._http.tpl';
  } elseif ($this->redirects > MAX_REDIRECTS) {
    $failure = 'redirects';
    $template = 'templates/error._redirects.tpl';
  } else {
    $failure = 'connection';
    $template = 'templates/error._connection.tpl';
  }
  $_url = htmlspecialchars($data_id);

  // Render the template into a buffer and append it to the error message
  ob_start();
  include($template);
  $this->error_message .= ob_get_contents();
  ob_end_clean();

  if ($failure == 'http') {
    error_log("Cannot open $data_id, HTTP result code is: ".$this->code);
  } elseif ($failure == 'redirects') {
    error_log(sprintf("Cannot open %s, too many redirects",
                      $data_id));
  } else {
    error_log(sprintf("Cannot open %s",
                      $data_id));
  }

  return null;
}
/**
 * Returns the error text accumulated in $this->error_message
 * (rendered error templates and connection failure messages).
 *
 * @return string
 */
function error_message() {
return $this->error_message;
}
/**
 * FetcherURL - PHP 4 style constructor.
 *
 * Kept so that PHP 4 (which does not know __construct) and any code calling
 * this method explicitly keep working. It only delegates to __construct().
 * It is declared before __construct() on purpose: PHP 5 raises an E_STRICT
 * notice when the old-style constructor follows __construct().
 */
function FetcherURL() {
  $this->__construct();
}

/**
 * FetcherURL - constructor.
 *
 * PHP 8 no longer treats a method named after its class as a constructor,
 * so the initialization lives here; without it the fields below would never
 * be set on modern PHP versions.
 */
function __construct() {
  $this->_connections = array();
  $this->error_message = "";
  $this->redirects = 0;
  $this->port = 80;

  // Default encoding
  // $this->encoding = "iso-8859-1";

  $this->user_agent = DEFAULT_USER_AGENT;
}
// ---------------------------------------------
// FetcherURL - PRIVATE methods
// ---------------------------------------------
/**
 * Connects to the target host using either HTTP or HTTPS protocol;
 * returns a handle to the connection socket, or null if the connection failed.
 *
 * HTTPS connections are delegated to _connect_ssl(). On failure the message
 * is written to the error log and stored in $this->error_message
 * (replacing its previous value).
 *
 * @access private
 * @final
 * @return resource
 */
function _connect() {
  // Secure connections have their own setup routine
  if ($this->protocol == "https") {
    return $this->_connect_ssl();
  }

  $socket = @fsockopen($this->host, $this->port, $errno, $errstr, HTML2PS_CONNECTION_TIMEOUT);
  if ($socket) {
    return $socket;
  }

  $failure = sprintf("Cannot connect to %s:%d - (%d) %s",
                     $this->host,
                     $this->port,
                     $errno,
                     $errstr);
  error_log($failure);
  $this->error_message = $failure;
  return null;
}
/**
 * Opens an SSL connection to $this->host:$this->port.
 *
 * Uses the same HTML2PS_CONNECTION_TIMEOUT as the plain-HTTP _connect()
 * instead of a hard-coded value, so both connection types honour the
 * configured timeout.
 *
 * @access private
 * @return resource connection socket, or null if the openssl extension is
 *                  missing or the connection could not be established
 */
function _connect_ssl() {
  /**
   * Check if there's SSL support library loaded
   *
   * Note that in certain situations (e.g. Windows + PHP 4.4.0 + Apache 2 on my development box)
   * openssl extension IS present, but fsockopen still complains "No SSL support in this build".
   * (probably PHP bug?)
   */
  if (!extension_loaded('openssl')) {
    $message = sprintf("Cannot connect to %s:%d. SSL Extension missing",
                       $this->host,
                       $this->port);
    error_log($message);
    $this->error_message .= $message;
    return null;
  };

  $fp = @fsockopen("ssl://$this->host", $this->port, $errno, $errstr, HTML2PS_CONNECTION_TIMEOUT);
  if (!$fp) {
    // Note: the format string intentionally contains a literal line break
    $message = sprintf("Cannot connect to %s:%d - (%d) %s
Missing SSL support?",
                       $this->host,
                       $this->port,
                       $errno,
                       $errstr);
    error_log($message);
    $this->error_message = $message;
    return null;
  };

  return $fp;
}
/**
 * Extracts the HTTP result code from a raw server response.
 *
 * The code is taken from the first run of digits surrounded by whitespace.
 * Not every response contains one, so "200" is assumed when nothing matches.
 *
 * @access private
 * @param string $res raw response text
 * @return string result code as a string of digits
 */
function _extract_code($res) {
  $found = array();
  if (!preg_match('/\s(\d+)\s/', $res, $found)) {
    // No recognizable code in the response; treat it as success
    return "200";
  }
  return $found[1];
}
/**
 * Converts the value of a "Location" header into an absolute URL, using the
 * protocol, host, port and path of the request currently being processed.
 *
 * Handled forms:
 *  - absolute http:// and https:// URLs are returned unchanged;
 *  - protocol-relative locations ("//host/path") get the current protocol;
 *  - host-relative locations ("/path") are appended to the current host;
 *  - anything else is resolved against the directory of the current path
 *    (the query string and the last path segment are dropped first).
 *
 * A port that differs from the protocol default (80, or 443 for https) is
 * kept in the generated URL.
 *
 * @access private
 * @param string $location raw Location header value
 * @return string absolute URL
 */
function _fix_location($location) {
  if (substr($location, 0, 7) == "http://") { return $location; };
  if (substr($location, 0, 8) == "https://") { return $location; };

  // Protocol-relative reference: only the scheme is missing
  if (substr($location, 0, 2) == "//") {
    return $this->protocol.":".$location;
  };

  $base = $this->protocol."://".$this->host;
  $default_port = (strtolower($this->protocol) == "https") ? 443 : 80;
  if (isset($this->port) && $this->port != "" && $this->port != $default_port) {
    $base .= ":".$this->port;
  };

  // substr() instead of an offset access: safe for an empty string, and the
  // "{0}" offset syntax is no longer accepted by PHP 8
  if (substr($location, 0, 1) == "/") {
    return $base.$location;
  };

  // $this->path may carry a query string (fetch() appends it) and usually
  // ends with a file name; neither belongs to the base directory
  $directory = $this->path;
  $query_pos = strpos($directory, '?');
  if ($query_pos !== false) {
    $directory = substr($directory, 0, $query_pos);
  };
  $slash_pos = strrpos($directory, '/');
  if ($slash_pos === false) {
    $directory = "/";
  } else {
    $directory = substr($directory, 0, $slash_pos + 1);
  };

  return $base.$directory.$location;
}
/**
 * Fetches the given URL, dispatching on its protocol (http, https or file).
 *
 * The URL is split with parse_url() and its components are stored in
 * $this->protocol, host, user, pass, port and path (the query string, if
 * any, is appended to the path). The protocol is stored in lower case so
 * that the case-sensitive "https" comparisons made while connecting agree
 * with the case-insensitive dispatch done here.
 *
 * @param string $url URL to fetch
 * @return mixed result of the protocol-specific fetch method; true for an
 *               unparseable URL (empty content is stored); null for an
 *               empty URL or an unsupported protocol
 */
function fetch($url) {
  /**
   * Handle empty $url value; unfortunately, parse_url will treat an empty value as a valid
   * URL, so the fetcher would attempt to fetch something from the localhost instead of
   * passing control to subsequent user-defined fetchers (which probably will know
   * how to handle this). A null value is parsed the same way, so it is rejected too.
   */
  if (is_null($url) || $url === "") {
    return null;
  }

  $this->url = $url;
  $parts = @parse_url($this->url);

  /**
   * If a malformed URL has been specified, add a message to the log file and
   * continue processing (as such URLs may be found in an otherwise good HTML file -
   * for example, an invalid image or CSS reference)
   */
  if ($parts == false) {
    error_log(sprintf("The URL '%s' could not be parsed", $this->url));
    $this->content = '';
    $this->code = HTTP_OK;
    return true;
  };

  /**
   * Setup default values
   */
  $this->protocol = 'http';
  $this->host = 'localhost';
  $this->user = "";
  $this->pass = "";
  $this->port = 80;
  $this->path = "/";
  $this->query = "";

  /**
   * Override the defaults with the components actually present in the URL
   */
  if (isset($parts['scheme'])) { $this->protocol = strtolower($parts['scheme']); };
  if (isset($parts['host'])) { $this->host = $parts['host']; };
  if (isset($parts['user'])) { $this->user = $parts['user']; };
  if (isset($parts['pass'])) { $this->pass = $parts['pass']; };
  if (isset($parts['port'])) { $this->port = $parts['port']; };
  if (isset($parts['path'])) { $this->path = $parts['path']; };
  if (isset($parts['query'])) { $this->path .= '?'.$parts['query']; };

  switch ($this->protocol) {
  case 'http':
    return $this->fetch_http();
  case 'https':
    return $this->fetch_https();
  case 'file':
    $this->host = "";
    return $this->fetch_file();
  default:
    $message = sprintf("Unsupported protocol: %s", $this->protocol);
    error_log($message);
    $this->error_message .= $message;
    return null;
  }
}
/**
 * Fetches the current URL over plain HTTP.
 *
 * Sends a HEAD request, stores the result code extracted from the response
 * in $this->code and lets _process_code() decide what to do next.
 *
 * @return mixed null if the request could not be sent; otherwise the
 *               result of _process_code()
 */
function fetch_http() {
  $response = $this->_head();
  if ($response === null) {
    return null;
  }

  $this->code = $this->_extract_code($response);
  return $this->_process_code($response);
}
/**
 * Fetches the current URL over HTTPS.
 *
 * Same flow as fetch_http(), except that the port defaults to 443 when the
 * URL does not name one explicitly.
 *
 * @return mixed null if the request could not be sent; otherwise the
 *               result of _process_code()
 */
function fetch_https() {
  /**
   * SSL works via port 443.
   *
   * The parsed URL components are not available in this method's scope, so
   * the URL is parsed again here; otherwise the "port given explicitly"
   * check could never succeed and a custom HTTPS port would always be
   * replaced by 443.
   */
  $parts = @parse_url($this->url);
  if ($this->protocol == "https" && !isset($parts['port'])) {
    $this->port = 443;
  }

  $res = $this->_head();
  if (is_null($res)) { return null; };
  $this->code = $this->_extract_code($res);

  return $this->_process_code($res);
}
/**
 * Reads a local file referenced by a file:// URL into $this->content.
 *
 * The path is URL-decoded and canonicalized with realpath(); it is only read
 * when the canonical path starts with FILE_PROTOCOL_RESTRICT. A file that
 * does not exist, is outside the allowed location or cannot be read is
 * logged and yields empty content. In all cases $this->code is set to
 * HTTP_OK and true is returned, so processing of the document continues.
 *
 * @return bool always true
 */
function fetch_file() {
  // Strip the leading "file://" (7 characters)
  $path = substr($this->url, 7);

  // On Windows "file:///C:/dir/file" leaves "/C:/dir/file"; drop the extra slash.
  // substr() is used instead of an offset access: it is safe for an empty
  // path, and the "{0}" offset syntax is no longer accepted by PHP 8.
  if (PHP_OS == "WINNT" && substr($path, 0, 1) == "/") {
    $path = substr($path, 1);
  };

  $normalized_path = realpath(urldecode($path));
  if ($normalized_path === false) {
    // realpath() fails for paths that do not exist or cannot be resolved
    error_log(sprintf("Cannot resolve file path '%s'", $path));
    $this->content = "";
    $this->code = HTTP_OK;
    return true;
  }

  // NOTE(review): this is a plain string-prefix comparison. If
  // FILE_PROTOCOL_RESTRICT does not end with a directory separator, sibling
  // paths sharing the prefix (e.g. "/var/www-private" vs "/var/www") pass
  // the check too - confirm how the constant is defined.
  $normalized_path_part = substr($normalized_path, 0, strlen(FILE_PROTOCOL_RESTRICT));
  if ($normalized_path_part !== FILE_PROTOCOL_RESTRICT) {
    error_log(sprintf("Access denied to file '%s'", $normalized_path));
    $this->content = "";
    $this->code = HTTP_OK;
    return true;
  }

  $content = @file_get_contents($normalized_path);
  if ($content === false) {
    // Keep the content a string, as in the other failure paths
    error_log(sprintf("Cannot read file '%s'", $normalized_path));
    $content = "";
  }

  $this->content = $content;
  $this->code = HTTP_OK;
  return true;
}
/**
 * Sends a GET request for the current path to the current host and returns
 * the raw response.
 *
 * The response is read with a single fread() call limited to 1 Mb rather
 * than with a loop until end-of-file: according to the original author's
 * notes, the usual
 *   while (!feof($fp)) { $res .= fread($fp, 4096); };
 * loop hung indefinitely on some servers (www.searchscout.com, for example),
 * apparently because they did not close the connection on their side.
 *
 * @access private
 * @return mixed raw response as returned by fread(), or null if the
 *               connection could not be established
 */
function _get() {
  $socket = $this->_connect();
  if ($socket === null) {
    return null;
  }

  // Build the GET request (we pretend to be a regular browser, as some
  // pages do not like non-standard user agents)
  $request = "GET ".$this->path." HTTP/1.1\r\n"
           . "Host: ".$this->host."\r\n"
           . "Accept: */*\r\n"
           . "User-Agent: ".$this->user_agent."\r\n"
           . "Connection: keep-alive\r\n"
           . "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"
           . $this->_header_basic_authorization()
           . "\r\n";
  fwrite($socket, $request);

  // Read the response: one read, at most 1 Mb
  $response = fread($socket, 1024*1024);

  // The connection is not needed anymore
  fclose($socket);

  return $response;
}
/**
 * Sends a HEAD request for the current path to the current host and returns
 * the raw response.
 *
 * Only up to 4096 bytes are read, with a single fread() call; no loop until
 * end-of-file is used because, according to the original author's notes, the
 * usual
 *   while (!feof($fp)) { $res .= fread($fp, 4096); };
 * loop hung indefinitely on some servers (www.searchscout.com, for example),
 * apparently because they did not close the connection on their side.
 *
 * @access private
 * @return mixed raw response as returned by fread(), or null if the
 *               connection could not be established
 */
function _head() {
  $socket = $this->_connect();
  if ($socket === null) {
    return null;
  }

  // Build the HEAD request (we pretend to be a regular browser, as some
  // pages do not like non-standard user agents). Note that two Accept
  // headers are sent: "*/*" and "text/html".
  $request = "HEAD ".$this->path." HTTP/1.1\r\n"
           . "Host: ".$this->host."\r\n"
           . "Accept: */*\r\n"
           . "User-Agent: ".$this->user_agent."\r\n"
           . "Connection: keep-alive\r\n"
           . "Accept: text/html\r\n"
           . "Referer: ".$this->protocol."://".$this->host.$this->path."\r\n"
           . $this->_header_basic_authorization()
           . "\r\n";

  // Send the request
  fwrite($socket, $request);

  // Read the response: one read, at most 4096 bytes
  $response = fread($socket, 4096);

  // The connection is not needed anymore
  fclose($socket);

  return $response;
}
/**
 * Acts on the HTTP result code stored in $this->code.
 *
 *  - 200: stores the response headers and downloads the content;
 *  - 301, 302, 303, 307, 308: follows the Location header (up to
 *    MAX_REDIRECTS times); a redirect without a Location header is reported
 *    as a failure instead of re-fetching the current URL;
 *  - 405: retries once with a GET request, as some sites do not accept HEAD;
 *    if GET was already used, the response is handled like the other 4xx codes;
 *  - 400, 401, 402, 403, 404, 406: stores the headers and attempts to download
 *    the content (the server may have returned an HTML error page);
 *  - anything else is logged and treated as a failure.
 *
 * @access private
 * @param string $res      raw response of the HEAD (or GET) request
 * @param bool   $used_get true when $res was obtained with a GET request
 * @return mixed true on success, false on failure, null if the GET retry
 *               could not connect; for redirects, the result of fetch()
 */
function _process_code($res, $used_get = false) {
  switch ($this->code) {
  case '200': // OK
    if (preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
      $this->headers = $matches[1];
    };

    /**
     * @todo add error processing here
     *
     * Note: file_get_contents is smart enough to use basic authorization headers provided
     * user name / password are given in the URL.
     */
    $this->content = @file_get_contents($this->url);
    return true;

  case '301': // Moved Permanently
  case '302': // Found
  case '303': // See Other
  case '307': // Temporary Redirect
  case '308': // Permanent Redirect
    $this->redirects++;
    if ($this->redirects > MAX_REDIRECTS) { return false; };

    // Look for the header in the header block only (a GET response may
    // carry a body), and anchor it to the start of a line so that
    // "Content-Location:" is not mistaken for "Location:"
    $header_block = $res;
    $headers_end = strpos($res, "\r\n\r\n");
    if ($headers_end !== false) {
      $header_block = substr($res, 0, $headers_end);
    };
    if (!preg_match('/^Location:[ \t]*(\S+)/im', $header_block, $matches)) {
      error_log("Redirect response without Location header");
      return false;
    };

    error_log('Redirected to:'.$matches[1]);
    return $this->fetch($this->_fix_location($matches[1]));

  case '405': // Method not allowed; some sites (like MSN.COM) do not like "HEAD" HTTP requests
    // Try to get URL information using GET request (if we did not try it before)
    if (!$used_get) {
      $res = $this->_get();
      if (is_null($res)) { return null; };
      $this->code = $this->_extract_code($res);
      return $this->_process_code($res, true);
    };
    // GET was refused as well: intentionally fall through to the generic
    // client error handling below
  case '400': // Bad request
  case '401': // Unauthorized
  case '402': // Payment required
  case '403': // Forbidden
  case '404': // Not found - but should return some html content - error page
  case '406': // Not acceptable
    if (!preg_match('/(.*?)\r\n\r\n(.*)/s',$res,$matches)) {
      error_log("Unrecognized HTTP response");
      return false;
    };
    $this->headers = $matches[1];
    $this->content = @file_get_contents($this->url);
    return true;

  default:
    error_log("Unrecognized HTTP result code:".$this->code);
    return false;
  };
}
/**
 * Builds the "Authorization: Basic ..." request header line from
 * $this->user and $this->pass.
 *
 * @access private
 * @return string complete header line terminated by CRLF, or an empty
 *                string when no user name is set
 */
function _header_basic_authorization() {
  // isset() covers both a null and a not-yet-assigned user name
  if (!isset($this->user) || $this->user == "") {
    // Always return a string, so callers can concatenate the result safely
    return "";
  };

  $password = isset($this->pass) ? $this->pass : "";
  return sprintf("Authorization: Basic %s\r\n", base64_encode($this->user.":".$password));
}
}
?>