How to create a website crawler in PHP

1 Answer

0 votes
/**
 * Minimal recursive, depth-limited, same-site link crawler.
 *
 * Starting from one URL it downloads each page with cURL, prints every URL
 * that answered with HTTP 200, and follows the <a href> links found in the
 * page until the depth budget is used up. Only http(s) URLs on the start
 * host (or one of its sub-domains) are visited, and each URL is fetched at
 * most once.
 */
class website_crawler
{
    /** Start URL handed to the constructor. */
    protected $_url;
    /** Default base used to resolve relative links when no page URL is supplied. */
    protected $_base_url;
    /** Number of link levels to follow from the start URL. */
    protected $_depth;
    /** Host name of the start URL; the crawl never leaves this site. */
    protected $_host;
    /** Map of already visited URLs (URL => true). */
    protected $_seen = array();

    /**
     * @param string $url   Absolute URL (scheme and host required) to start from.
     * @param int    $depth How many link levels to follow.
     *
     * @throws InvalidArgumentException When no host can be parsed out of $url.
     */
    public function __construct($url, $depth = 3) {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            throw new InvalidArgumentException('Start URL must be absolute (scheme and host): ' . $url);
        }

        $this->_url = $url;
        $this->_base_url = $url;
        $this->_depth = $depth;
        $this->_host = $host;
    }

    /**
     * Resolve a link found in a page into an absolute URL.
     *
     * @param string      $relative_url Raw href value; may be absolute, protocol-relative,
     *                                  root-relative, path-relative, query-only,
     *                                  fragment-only or empty.
     * @param string|null $base_url     URL of the page the link was found on. Defaults to
     *                                  the start URL when omitted.
     *
     * @return string Absolute URL. The input is returned unchanged when it already has
     *                a scheme or when the base URL cannot be parsed.
     */
    protected function relative_path_to_absolute_url($relative_url, $base_url = null) {
        $relative_url = trim((string) $relative_url);
        if ($base_url === null) {
            $base_url = $this->_base_url;
        }

        // Already absolute (http:, https:, mailto:, javascript:, ...): nothing to resolve.
        if (parse_url($relative_url, PHP_URL_SCHEME) != '') {
            return $relative_url;
        }

        $base = parse_url($base_url);
        if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
            return $relative_url;
        }

        $scheme = $base['scheme'];
        // Keep a non-default port so links on e.g. localhost:8080 stay reachable.
        $authority = $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        $base_path = isset($base['path']) ? $base['path'] : '';
        $base_query = isset($base['query']) ? '?' . $base['query'] : '';
        $origin = $scheme . '://' . $authority;

        // An empty href refers to the current document.
        if ($relative_url === '') {
            return $origin . $base_path . $base_query;
        }

        // Protocol-relative link: only the scheme is inherited from the base.
        if (strncmp($relative_url, '//', 2) === 0) {
            return $scheme . ':' . $relative_url;
        }

        // Fragment-only link keeps the whole base document; query-only link replaces its query.
        if ($relative_url[0] === '#') {
            return $origin . $base_path . $base_query . $relative_url;
        }
        if ($relative_url[0] === '?') {
            return $origin . $base_path . $relative_url;
        }

        // Split off "?query#fragment" so dot-segment cleanup only touches the path.
        $cut = strcspn($relative_url, '?#');
        $relative_path = (string) substr($relative_url, 0, $cut);
        $suffix = (string) substr($relative_url, $cut);

        if ($relative_path[0] === '/') {
            // Root-relative: ignore the base path entirely.
            $path = $relative_path;
        } else {
            // Path-relative: replace the last segment of the base path.
            $path = preg_replace('#/[^/]*$#', '', $base_path) . '/' . $relative_path;
        }

        // Collapse "//" and "/./", and fold "/segment/../", until nothing changes.
        $patterns = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
        do {
            $path = preg_replace($patterns, '/', $path, -1, $replaced);
        } while ($replaced > 0);

        return $origin . $path . $suffix;
    }

    /**
     * Extract every <a href> from a page and crawl the linked URLs one level deeper.
     *
     * @param string|false $content Page body as returned by get_content().
     * @param string       $url     URL the content was fetched from; base for relative links.
     * @param int          $depth   Remaining depth budget of the current page.
     *
     * @return void
     */
    protected function get_href($content, $url, $depth) {
        // A failed download yields false/''; DOMDocument::loadHTML() rejects empty input.
        if (!is_string($content) || trim($content) === '') {
            return;
        }

        $dom = new DOMDocument('1.0');
        // Real-world HTML is rarely valid; collect parser errors instead of emitting warnings.
        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        // Gather the links first so the DOM can be released before recursing.
        $links = array();
        foreach ($dom->getElementsByTagName('a') as $element) {
            $href = trim($element->getAttribute('href'));
            if ($href === '') {
                continue;
            }

            $link_absolute = $this->relative_path_to_absolute_url($href, $url);

            // "#fragment" variants address the same document; fetch it only once.
            $hash = strpos($link_absolute, '#');
            if ($hash !== false) {
                $link_absolute = substr($link_absolute, 0, $hash);
            }

            $links[$link_absolute] = true;
        }
        unset($dom);

        foreach (array_keys($links) as $link_absolute) {
            $this->crawl_page($link_absolute, $depth - 1);
        }
    }

    /**
     * Download a URL with cURL.
     *
     * @param string $url URL to fetch.
     *
     * @return array Two elements: the response body (false when the transfer failed)
     *               and the HTTP status code (0 when no response was received).
     */
    protected function get_content($url) {
        $handle = curl_init($url);
        if ($handle === false) {
            return array(false, 0);
        }

        curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
        // Without timeouts a single unresponsive server would stall the whole crawl.
        curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($handle, CURLOPT_TIMEOUT, 30);

        $content = curl_exec($handle);
        $http_code = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        return array($content, $http_code);
    }

    /**
     * Print one crawled URL as an HTML line and push it to the client immediately.
     *
     * @param string $url URL to print.
     *
     * @return void
     */
    protected function print_urls($url) {
        // Discard whatever was buffered since the previous line, if a buffer is open at all.
        if (ob_get_level() > 0) {
            ob_end_clean();
        }
        // The URL comes from crawled markup, so escape it for the HTML output.
        echo htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . "<br />";
        ob_start();
        flush();
    }

    /**
     * Decide whether a URL should be crawled.
     *
     * @param string $url   Absolute URL candidate.
     * @param int    $depth Remaining depth budget.
     *
     * @return bool True when depth remains, the URL has not been visited yet and it is
     *              an http(s) URL on the start host or one of its sub-domains.
     */
    protected function check_error($url, $depth) {
        if ($depth <= 0 || isset($this->_seen[$url])) {
            return false;
        }

        $parts = parse_url($url);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return false;
        }

        $scheme = strtolower($parts['scheme']);
        if ($scheme !== 'http' && $scheme !== 'https') {
            return false;
        }

        // Compare the parsed host, not the raw string: a foreign URL may merely
        // mention our host name somewhere in its path or query.
        $host = strtolower($parts['host']);
        $own_host = strtolower($this->_host);
        if ($host === $own_host) {
            return true;
        }

        $subdomain_suffix = '.' . $own_host;
        return strlen($host) > strlen($subdomain_suffix)
            && substr($host, -strlen($subdomain_suffix)) === $subdomain_suffix;
    }

    /**
     * Fetch one page, print it when it answered 200, then follow its links.
     *
     * @param string $url   Absolute URL to crawl.
     * @param int    $depth Remaining depth budget; nothing happens once it reaches 0.
     *
     * @return void
     */
    public function crawl_page($url, $depth) {
        if (!$this->check_error($url, $depth)) {
            return;
        }
        // Mark before fetching so pages linking back to this one do not recurse forever.
        $this->_seen[$url] = true;
        list($content, $http_code) = $this->get_content($url);
        if ($http_code === 200) {
            $this->print_urls($url);
        }
        $this->get_href($content, $url, $depth);
    }

    /**
     * Crawl from the start URL using the depth given to the constructor.
     *
     * @return void
     */
    public function run() {
        $this->crawl_page($this->_url, $this->_depth);
    }
}

 
// Demo: crawl the site from its home page, following links two levels deep,
// then print a marker once the crawl has finished.
$start_url = 'https://www.collectivesolver.com';
$max_depth = 2;

$site_crawler = new website_crawler($start_url, $max_depth);
$site_crawler->run();

echo "<br /> END <br />";
   

     
     
/*
run:
     
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
      
*/

 



answered Sep 15, 2019 by avibootz
edited Sep 17, 2019 by avibootz

Related questions

2 answers 248 views
3 answers 291 views
291 views asked Feb 13, 2021 by avibootz
1 answer 385 views
1 answer 211 views
1 answer 159 views
...