Pages

Monday, 20 August 2012

A simple HTTP PHP class to crawl a URL for internal and external URLs


Here's a simple PHP class I wrote to crawl a URL and return a list of internal and external URLs. I've used it in the past, for development purposes only, to find 404s and repetition in URL structure. Note that it does not read robots.txt files or obey any similar rules. Just thought I'd pull it out of the archives and share it on the web.

#!/usr/bin/php

<?php
/**
 * Crawls a website over HTTP(S) and collects the URLs found in its anchor tags.
 *
 * The whole crawl runs inside the constructor: starting from the seed URL,
 * every internal page is fetched with file_get_contents() and scanned for
 * <a href="..."> links. Internal URLs end up in $urls_processed, links to other
 * sites in $urls_external. Call output_all_urls() afterwards to print both.
 *
 * Known limitations:
 * - robots.txt and similar rules are not read or obeyed.
 * - "Internal" means "same last two host labels" (see get_url_base()), so
 *   multi-part public suffixes such as "co.uk" are not told apart.
 * - Relative links without a leading slash are resolved against the site
 *   root, not against the directory of the page they were found on.
 */
class Crawl {

  /** @var string Regular expression capturing the href value of anchor tags. */
  protected $regex_link;

  /** @var string Seed URL exactly as passed to the constructor. */
  protected $website_url;

  /** @var string Last two host labels of the seed URL, e.g. "example.com". */
  protected $website_url_base;

  /** @var string|false Scheme, host and optional port of the seed URL, e.g. "https://www.example.com". */
  protected $website_url_origin;

  /** @var array List of internal URLs that have been fetched (or attempted). */
  protected $urls_processed;

  /** @var array List of URLs whose base differs from $website_url_base. */
  protected $urls_external;

  /** @var array Queue of internal URLs still waiting to be fetched. */
  protected $urls_not_processed;

  /** @var array Substrings; any URL containing one of them (case-insensitive) is skipped. */
  protected $urls_ignored;

  /**
   * Sets up the crawler and immediately crawls the given website.
   *
   * If the URL is not a string starting with http:// or https://, or its host
   * cannot be determined, the constructor returns early and the URL lists stay
   * empty.
   *
   * @param string|null $website_url
   *   Absolute http:// or https:// URL to start crawling from.
   */
  public function __construct($website_url = NULL) {

    // Initialise state before validating, so output_all_urls() is safe to call
    // even when the argument turns out to be invalid.
    // NOTE: the deprecated 'track_errors' ini setting is no longer enabled here;
    // scan_url() checks the return value of file_get_contents() instead.
    $this->regex_link = "/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU";
    $this->urls_processed = array();
    $this->urls_external = array();
    $this->urls_not_processed = array();
    $this->urls_ignored = array(
      '/search/apachesolr_search/',
      '/comment/reply/',
    );

    // validate argument(s); a constructor cannot return a value, so just stop
    if (!$this->validate_arg_website_url($website_url)) {
      return;
    }

    // set website argument
    $this->website_url = $website_url;

    // get url base
    $url_base = $this->get_url_base($this->website_url);
    if (!$url_base) {
      return;
    }
    $this->website_url_base = $url_base;

    // remember scheme + host so relative links keep e.g. "https" and "www."
    $this->website_url_origin = $this->get_url_origin($this->website_url);

    // Queue the seed URL. Discovered links have their trailing slash removed
    // in process_matches(); do the same here so the home page is not crawled
    // twice under two spellings.
    $seed = $this->website_url;
    if (substr($seed, -1) === '/') {
      $seed = substr($seed, 0, -1);
    }
    $this->urls_not_processed[] = $seed;

    // process_matches() keeps appending newly found internal URLs to the queue
    while (count($this->urls_not_processed)) {
      $this->process_urls_not_processed();
    }

    // sort data
    sort($this->urls_processed);
    sort($this->urls_external);

  }

  /**
   * Checks that the constructor argument is an absolute HTTP(S) URL.
   *
   * @param mixed $website_url
   *   Value to validate.
   *
   * @return bool
   *   TRUE when $website_url is a string starting with http:// or https://
   *   (scheme compared case-insensitively), FALSE otherwise.
   */
  protected function validate_arg_website_url($website_url = NULL) {

    return $this->has_http_scheme($website_url);

  }

  /**
   * Tells whether a value is a string beginning with http:// or https://.
   *
   * @param mixed $url
   *   Value to test.
   *
   * @return bool
   *   TRUE for strings with an http or https scheme prefix, FALSE otherwise.
   */
  protected function has_http_scheme($url) {

    return is_string($url) && preg_match('#^https?://#i', $url) === 1;

  }

  /**
   * Returns the last two labels of a URL's host ("www.example.com" gives "example.com").
   *
   * @param string|null $url
   *   Absolute URL.
   *
   * @return string|false
   *   Lower-cased base host; the host itself when it has a single label (e.g.
   *   "localhost"); FALSE when $url is empty, cannot be parsed or has no host.
   */
  protected function get_url_base($url = NULL) {

    // validate url
    if (!is_string($url) || !$url) {
      return FALSE;
    }

    $url_parts = parse_url($url);

    // parse_url() returns FALSE on seriously malformed URLs and omits 'host'
    // for relative ones
    if (!is_array($url_parts) || empty($url_parts['host'])) {
      return FALSE;
    }

    // explode host on '.', ignoring case and a trailing root dot
    $labels = explode('.', rtrim(strtolower($url_parts['host']), '.'));

    // single-label hosts have no "domain.extension" pair to extract
    if (count($labels) < 2) {
      return $labels[0];
    }

    // return host and domain extension
    return implode('.', array_slice($labels, -2));

  }

  /**
   * Returns the "scheme://host[:port]" prefix of an absolute URL.
   *
   * @param string|null $url
   *   Absolute URL.
   *
   * @return string|false
   *   The origin with a lower-cased scheme, or FALSE when $url is not a string
   *   or lacks a scheme or host.
   */
  protected function get_url_origin($url = NULL) {

    $url_parts = is_string($url) ? parse_url($url) : FALSE;

    if (!is_array($url_parts) || empty($url_parts['scheme']) || empty($url_parts['host'])) {
      return FALSE;
    }

    $origin = strtolower($url_parts['scheme']) . '://' . $url_parts['host'];
    if (isset($url_parts['port'])) {
      $origin .= ':' . $url_parts['port'];
    }

    return $origin;

  }

  /**
   * Fetches a page and extracts the href values of its anchor tags.
   *
   * The URL is added to $urls_processed before it is fetched, so a URL that
   * fails to load is still listed and is never retried. Warnings raised by
   * file_get_contents() are not suppressed.
   *
   * @param string $url
   *   URL to fetch.
   *
   * @return array|false
   *   List of unique href values (possibly empty), or FALSE when $url is
   *   invalid, was already processed, could not be loaded, returned an empty
   *   body, or the regular expression failed.
   */
  protected function scan_url($url) {

    // validate url
    if (!is_string($url) || !$url) {
      return FALSE;
    }

    // ensure url has not already been processed
    if (in_array($url, $this->urls_processed, TRUE)) {
      return FALSE;
    }

    // add url to processed list
    $this->urls_processed[] = $url;

    // load page contents; file_get_contents() returns FALSE on failure, which
    // replaces the former check of $php_errormsg (removed in PHP 8)
    $page_contents = file_get_contents($url);
    if ($page_contents === FALSE || $page_contents === '') {
      return FALSE;
    }

    // execute regex
    $matches = array();
    if (preg_match_all($this->regex_link, $page_contents, $matches) === FALSE) {
      return FALSE;
    }

    return array_values(array_unique($matches[1]));

  }

  /**
   * Normalises raw href values and files each one as external or to-be-crawled.
   *
   * Per href: HTML entities are decoded; anchors and non-HTTP schemes are
   * skipped; protocol-relative and relative links are made absolute using the
   * seed URL's scheme and host; one trailing slash is removed; URLs containing
   * an entry of $urls_ignored are skipped. URLs whose base differs from
   * $website_url_base go to $urls_external, the rest are appended to
   * $urls_not_processed unless already processed or queued.
   *
   * @param array|null $matches
   *   Raw href values as returned by scan_url().
   *
   * @return bool
   *   FALSE when $matches is not a non-empty array, TRUE otherwise.
   */
  protected function process_matches($matches = NULL) {

    // validate
    if (!is_array($matches) || empty($matches)) {
      return FALSE;
    }

    // Prefix for relative links. Falls back to the former behaviour
    // ("http://" + base) when no origin was recorded.
    $origin = $this->website_url_origin
      ? $this->website_url_origin
      : 'http://' . $this->website_url_base;
    $scheme = parse_url($origin, PHP_URL_SCHEME);
    if (!$scheme) {
      $scheme = 'http';
    }

    foreach ($matches as $match) {

      if (!is_string($match)) {
        continue;
      }

      // attribute values are HTML-encoded ("&amp;" stands for "&")
      $match = trim(html_entity_decode($match, ENT_QUOTES));

      // ignore empty values and anchors
      if ($match === '' || $match[0] === '#') {
        continue;
      }

      // protocol-relative url ("//host/path"): reuse the seed's scheme
      if (strlen($match) > 2 && substr($match, 0, 2) === '//') {
        $match = $scheme . ':' . $match;
      }
      // internal url that begins with '/'
      elseif ($match[0] === '/') {
        $match = $origin . $match;
      }
      // ignore javascript:, mailto:, tel:, ftp://, data: and any other
      // scheme that cannot be fetched as a web page
      elseif (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $match) && !$this->has_http_scheme($match)) {
        continue;
      }

      // remove trailing slash
      if (substr($match, -1) === '/') {
        $match = substr($match, 0, -1);
      }

      // remaining relative paths are resolved against the site root
      // NOTE: this needs work, paths like '../' should be resolved against the
      // page they were found on.
      if (!$this->has_http_scheme($match)) {
        $match = $origin . '/' . $match;
      }

      // check if url is to be ignored
      foreach ($this->urls_ignored as $ignored) {
        if (stripos($match, $ignored) !== FALSE) {
          continue 2;
        }
      }

      // check for external url (an unparseable url yields FALSE and is
      // therefore reported as external as well)
      if ($this->get_url_base($match) !== $this->website_url_base) {
        if (!in_array($match, $this->urls_external, TRUE)) {
          $this->urls_external[] = $match;
        }
        continue;
      }

      // check if url has already been processed
      if (in_array($match, $this->urls_processed, TRUE)) {
        continue;
      }

      // add url to list of urls to process
      if (!in_array($match, $this->urls_not_processed, TRUE)) {
        $this->urls_not_processed[] = $match;
      }

    }

    return TRUE;

  }

  /**
   * Takes the next URL off the queue, scans it and files the links it contains.
   *
   * @return bool
   *   FALSE when the queue is empty or the page produced no links, TRUE when
   *   links were handed to process_matches().
   */
  protected function process_urls_not_processed() {

    if (empty($this->urls_not_processed)) {
      return FALSE;
    }

    // get unprocessed url
    $url = array_shift($this->urls_not_processed);

    // scan url
    $matches = $this->scan_url($url);

    // error check
    if (!is_array($matches) || empty($matches)) {
      return FALSE;
    }

    return $this->process_matches($matches);

  }

  /**
   * Prints the internal URLs followed by the external URLs, one per line,
   * each list preceded by a heading line.
   */
  public function output_all_urls() {

    echo "===== INTERNAL URLS =====\n";
    foreach ($this->urls_processed as $url) {
      print $url . "\n";
    }

    echo "===== EXTERNAL URLS =====\n";
    foreach ($this->urls_external as $url) {
      print $url . "\n";
    }

  }

}
?>


It can be used like this:
<?php
// Crawl the site starting at its home page (the crawl runs inside the
// constructor), then print the internal and external URLs that were found.
$start_url = 'http://www.example.com';
$site_crawl = new Crawl($start_url);
$site_crawl->output_all_urls();
?>

No comments:

Post a Comment