/**
 * Recursive web crawler restricted to the host of its start URL.
 *
 * Starting from one URL, it downloads each page with cURL, prints the URL of
 * every page answered with HTTP 200, and follows the <a href> links found in
 * the page until the configured depth is used up.
 */
class website_crawler
{
    /** @var string URL the crawl starts from. */
    protected $_url;

    /** @var string Base URL used to resolve links when no page URL is supplied. */
    protected $_base_url;

    /** @var int Number of link levels to follow, the start page included. */
    protected $_depth;

    /** @var string Host of the start URL; only this host and its subdomains are crawled. */
    protected $_host;

    /** @var array Visited URLs, stored as URL => true. */
    protected $_seen = array();

    /**
     * @param string $url   Absolute URL to start from (must contain a host).
     * @param int    $depth Number of link levels to follow.
     *
     * @throws InvalidArgumentException When no host can be parsed out of $url.
     */
    public function __construct($url, $depth = 3)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            throw new InvalidArgumentException("Start URL must be absolute and contain a host: $url");
        }

        $this->_url = $url;
        $this->_base_url = $url;
        $this->_depth = $depth;
        $this->_host = $host;
    }

    /**
     * Returns $url without its "#fragment" part.
     *
     * @param string $url
     *
     * @return string
     */
    protected function strip_fragment($url)
    {
        $position = strpos($url, '#');

        return $position === false ? $url : substr($url, 0, $position);
    }

    /**
     * Resolves a link against the page it was found on.
     *
     * @param string      $relative_url Link as written in the href attribute.
     * @param string|null $base_url     URL of the page containing the link;
     *                                  defaults to the crawler's start URL.
     *
     * @return string Absolute URL. Links that already carry a scheme are
     *                returned unchanged; an empty link resolves to the base.
     */
    protected function relative_path_to_absolute_url($relative_url, $base_url = null)
    {
        if ($base_url === null) {
            $base_url = $this->_base_url;
        }
        $base_url = $this->strip_fragment($base_url);
        $relative_url = trim($relative_url);

        if ($relative_url === '') {
            return $base_url;
        }

        $link_scheme = parse_url($relative_url, PHP_URL_SCHEME);
        if (is_string($link_scheme) && $link_scheme !== '') {
            return $relative_url;
        }

        $parts = parse_url($base_url);
        if ($parts === false || !isset($parts['scheme'], $parts['host'])) {
            // Nothing sensible to resolve against; hand the link back untouched.
            return $relative_url;
        }

        $scheme = $parts['scheme'];
        $authority = $parts['host'] . (isset($parts['port']) ? ':' . $parts['port'] : '');
        $base_path = isset($parts['path']) ? $parts['path'] : '/';

        // Protocol-relative link: only the scheme comes from the base.
        if (substr($relative_url, 0, 2) === '//') {
            return $scheme . ':' . $relative_url;
        }

        // Query-only link replaces the query of the base page.
        if ($relative_url[0] === '?') {
            return "$scheme://$authority$base_path$relative_url";
        }

        // Fragment-only link points into the base page itself.
        if ($relative_url[0] === '#') {
            $query = isset($parts['query']) ? '?' . $parts['query'] : '';

            return "$scheme://$authority$base_path$query$relative_url";
        }

        // Keep the query/fragment out of path normalisation so that values
        // such as "?next=http://..." are not rewritten.
        $suffix = '';
        $cut = strcspn($relative_url, '?#');
        if ($cut < strlen($relative_url)) {
            $suffix = substr($relative_url, $cut);
            $relative_url = substr($relative_url, 0, $cut);
        }

        if ($relative_url[0] === '/') {
            $path = $relative_url;
        } else {
            // Drop the file name of the base page, keep its directory.
            $directory = preg_replace('#/[^/]*$#', '', $base_path);
            $path = "$directory/$relative_url";
        }

        // Collapse "", "." and ".." segments.
        $raw_segments = explode('/', $path);
        $last = end($raw_segments);
        $is_directory = ($last === '' || $last === '.' || $last === '..');

        $segments = array();
        foreach ($raw_segments as $segment) {
            if ($segment === '' || $segment === '.') {
                continue;
            }
            if ($segment === '..') {
                array_pop($segments);
                continue;
            }
            $segments[] = $segment;
        }

        $normalized = '/' . implode('/', $segments);
        if ($is_directory && $normalized !== '/') {
            $normalized .= '/';
        }

        return "$scheme://$authority$normalized$suffix";
    }

    /**
     * Extracts every <a href> from a page and crawls it one level deeper.
     *
     * @param string|false $content HTML of the page (false when the download failed).
     * @param string       $url     URL of the page, used as base for its links.
     * @param int          $depth   Remaining depth of the page itself.
     *
     * @return void
     */
    protected function get_href($content, $url, $depth)
    {
        if (!is_string($content) || trim($content) === '') {
            return;
        }

        $dom = new DOMDocument('1.0');

        // Real-world HTML is rarely valid; collect parser errors silently
        // instead of suppressing everything with "@".
        $previous = libxml_use_internal_errors(true);
        $loaded = $dom->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            return;
        }

        foreach ($dom->getElementsByTagName('a') as $element) {
            $href = trim($element->getAttribute('href'));

            // Empty links and in-page anchors lead back to this very page.
            if ($href === '' || $href[0] === '#') {
                continue;
            }

            $link_absolute = $this->relative_path_to_absolute_url($href, $url);

            // The fragment never reaches the server, so it must not make
            // the same page look like a new URL.
            $link_absolute = $this->strip_fragment($link_absolute);

            $this->crawl_page($link_absolute, $depth - 1);
        }
    }

    /**
     * Downloads a URL with cURL.
     *
     * @param string $url
     *
     * @return array Two elements: the response body (false on failure) and
     *               the HTTP status code (0 when no response was received).
     */
    protected function get_content($url)
    {
        $handle = curl_init($url);
        if ($handle === false) {
            return array(false, 0);
        }

        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        // Do not let a single unresponsive server stall the whole crawl.
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($handle, CURLOPT_TIMEOUT, 30);

        $content = curl_exec($handle);
        $http_code = (int) curl_getinfo($handle, CURLINFO_HTTP_CODE);

        // Since PHP 8.0 the handle is an object freed automatically and
        // curl_close() has no effect; it is only needed on older versions.
        if (PHP_VERSION_ID < 80000) {
            curl_close($handle);
        }

        return array($content, $http_code);
    }

    /**
     * Prints one crawled URL as an HTML line and pushes it to the client.
     *
     * @param string $url
     *
     * @return void
     */
    protected function print_urls($url)
    {
        // The URL comes from crawled pages, so escape it for HTML output.
        echo htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') . "<br />";

        if (ob_get_level() > 0) {
            ob_flush();
        }
        flush();
    }

    /**
     * Decides whether a URL should be crawled.
     *
     * @param string $url
     * @param int    $depth Remaining depth.
     *
     * @return bool True when the URL may be crawled: depth is left, the URL
     *              was not visited yet, it uses http/https, and its host is
     *              the start host or one of its subdomains.
     */
    protected function check_error($url, $depth)
    {
        if ($depth <= 0 || isset($this->_seen[$url])) {
            return false;
        }

        $scheme = parse_url($url, PHP_URL_SCHEME);
        if (!is_string($scheme) || !in_array(strtolower($scheme), array('http', 'https'), true)) {
            return false;
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            return false;
        }

        // Compare the parsed host rather than searching the whole URL, so the
        // host name appearing in a path or query string does not count.
        $host = strtolower($host);
        $own_host = strtolower($this->_host);
        if ($host === $own_host) {
            return true;
        }

        $subdomain_suffix = '.' . $own_host;

        return substr($host, -strlen($subdomain_suffix)) === $subdomain_suffix;
    }

    /**
     * Crawls one page: downloads it, prints its URL on HTTP 200 and follows
     * its links while depth remains.
     *
     * @param string $url
     * @param int    $depth Remaining depth, this page included.
     *
     * @return void
     */
    public function crawl_page($url, $depth)
    {
        if (!$this->check_error($url, $depth)) {
            return;
        }

        $this->_seen[$url] = true;

        list($content, $http_code) = $this->get_content($url);
        if ($http_code === 200) {
            $this->print_urls($url);
        }

        // Links of a page at depth 1 would be crawled at depth 0, i.e. never;
        // skip parsing the page in that case.
        if ($depth > 1) {
            $this->get_href($content, $url, $depth);
        }
    }

    /**
     * Starts the crawl from the URL and depth given to the constructor.
     *
     * @return void
     */
    public function run()
    {
        $this->crawl_page($this->_url, $this->_depth);
    }
}
// Driver: crawl the site two link levels deep, starting at its home page,
// then print an end marker once the crawl has finished.
$depth = 2;
$url = 'https://www.collectivesolver.com';

$crawler = new website_crawler($url, $depth);
$crawler->run();

echo '<br /> END <br />';
/*
run:
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
*/