Welcome to collectivesolver - Programming & Software Q&A. A website you can trust. All programs are tested and work. Contact: aviboots(AT)netvision.net.il
BlueHost Web Hosting

Bonsai Boy of New York Bonsai Trees and Accessories

Liquid Web iThemes WordPress Hosting

Ecommerce Software - Best Ecommerce Platform Made for You - Free Trial

DreamHost Web Hosting


getResponse autofunnels

Liquid Web Cloud VPS Hosting

Disclosure: We are a professional information website that receives compensation from some of the links we show you. This means that our content may contain affiliate links.

12,111 questions

16,499 answers

573 users

How to extract all the URLs from a web page in PHP

Online Web Development & Programming Courses | Udemy
20 views
asked Sep 14, 2019 by avibootz
edited Sep 15, 2019 by avibootz

3 Answers

0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * - A link that already has a scheme is returned unchanged.
 * - An empty link yields $base_url.
 * - A link starting with '#' or '?' is appended to $base_url as-is.
 * - A protocol-relative link ("//host/path") only borrows the base scheme.
 * - Anything else is joined with the base host, port and directory, after
 *   which "//", "/./" and "/segment/../" sequences in the path are collapsed.
 *
 * If $base_url cannot be parsed into at least a scheme and a host, the link
 * is returned unchanged because there is nothing to resolve it against.
 *
 * @param string $relative_url The href value to resolve.
 * @param string $base_url     Absolute URL of the document holding the link.
 * @return string The resolved URL.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // Bail out early so the $relative_url[0] look-ups below never run on an
    // empty string.
    if ($relative_url === '') {
        return $base_url;
    }

    // Anything that already carries a scheme (http:, mailto:, ...) is absolute.
    $relative_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($relative_scheme) && $relative_scheme !== '') {
        return $relative_url;
    }

    if ($relative_url[0] === '#' || $relative_url[0] === '?') {
        return $base_url.$relative_url;
    }

    // Read the components from the array instead of extract()-ing them, so a
    // missing component (e.g. no path) cannot become an undefined variable.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // "//cdn.example.com/x" names its own host; only the scheme is inherited.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $base['scheme'].':'.$relative_url;
    }

    // Keep an explicit port, otherwise links on e.g. localhost:8080 break.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':'.$base['port'];
    }

    // Set the query/fragment aside: the clean-up patterns below are meant for
    // the path only and would otherwise mangle "?u=http://..." values.
    $split_at = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split_at);
    $suffix = (string) substr($relative_url, $split_at);

    // Directory of the base document: drop everything after the last slash.
    $path = isset($base['path']) ? $base['path'] : '';
    $path = preg_replace('#/[^/]*$#', '', $path);

    // A root-relative link ignores the base directory.
    if ($relative_url[0] === '/') {
        $path = '';
    }

    $absolute_url = "$authority$path/$relative_path";

    $arr = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');

    // Repeat until a pass changes nothing; one replacement can expose another.
    do {
        $absolute_url = preg_replace($arr, '/', $absolute_url, -1, $n);
    } while ($n > 0);

    return $base['scheme'].'://'.$absolute_url.$suffix;
}

/**
 * Download a URL with cURL, following redirects, and return the response body.
 *
 * @param string $url Address to fetch.
 * @return string The response body, or an empty string when the transfer
 *                could not be set up or failed.
 */
function get_html($url) {
    $handle = curl_init();
    if ($handle === false) {
        return '';
    }

    curl_setopt($handle, CURLOPT_HTTPGET, true);
    // Response headers are deliberately left out of the output. With redirects
    // being followed, every hop contributes its own header block, so cutting
    // at the first blank line would leave later header blocks in the "HTML".
    curl_setopt($handle, CURLOPT_HEADER, false);
    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

    $output = curl_exec($handle);

    curl_close($handle);

    // curl_exec() returns false on failure; never hand that to the caller.
    return is_string($output) ? $output : '';
}
     
     
// Page to scan and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";
$html = get_html($url);

$doc = new DOMDocument();
// Collect parser complaints internally instead of printing them as warnings.
libxml_use_internal_errors(true);

// loadHTML() rejects empty input, so only parse when the download succeeded.
if (is_string($html) && $html !== '') {
    $doc->loadHTML($html);
}
libxml_clear_errors();

$link_tags = $doc->getElementsByTagName('a');
foreach ($link_tags as $linktag) {
    $href = $linktag->getAttribute('href');

    // Skip anchors without a target; compare against '' so href="0" is kept.
    if ($href !== '') {
        $link_absolute = relative_path_to_absolute_url($href, $base_url);
        // The URL comes from a remote page: escape it before printing as HTML.
        echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
    }
}


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 




answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * - A link that already has a scheme is returned unchanged.
 * - An empty link yields $base_url.
 * - A link starting with '#' or '?' is appended to $base_url as-is.
 * - A protocol-relative link ("//host/path") only borrows the base scheme.
 * - Anything else is joined with the base host, port and directory, after
 *   which "//", "/./" and "/segment/../" sequences in the path are collapsed.
 *
 * If $base_url cannot be parsed into at least a scheme and a host, the link
 * is returned unchanged because there is nothing to resolve it against.
 *
 * @param string $relative_url The href value to resolve.
 * @param string $base_url     Absolute URL of the document holding the link.
 * @return string The resolved URL.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // Bail out early so the $relative_url[0] look-ups below never run on an
    // empty string.
    if ($relative_url === '') {
        return $base_url;
    }

    // Anything that already carries a scheme (http:, mailto:, ...) is absolute.
    $relative_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($relative_scheme) && $relative_scheme !== '') {
        return $relative_url;
    }

    if ($relative_url[0] === '#' || $relative_url[0] === '?') {
        return $base_url.$relative_url;
    }

    // Read the components from the array instead of extract()-ing them, so a
    // missing component (e.g. no path) cannot become an undefined variable.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // "//cdn.example.com/x" names its own host; only the scheme is inherited.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $base['scheme'].':'.$relative_url;
    }

    // Keep an explicit port, otherwise links on e.g. localhost:8080 break.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':'.$base['port'];
    }

    // Set the query/fragment aside: the clean-up patterns below are meant for
    // the path only and would otherwise mangle "?u=http://..." values.
    $split_at = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split_at);
    $suffix = (string) substr($relative_url, $split_at);

    // Directory of the base document: drop everything after the last slash.
    $path = isset($base['path']) ? $base['path'] : '';
    $path = preg_replace('#/[^/]*$#', '', $path);

    // A root-relative link ignores the base directory.
    if ($relative_url[0] === '/') {
        $path = '';
    }

    $absolute_url = "$authority$path/$relative_path";

    $arr = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');

    // Repeat until a pass changes nothing; one replacement can expose another.
    do {
        $absolute_url = preg_replace($arr, '/', $absolute_url, -1, $n);
    } while ($n > 0);

    return $base['scheme'].'://'.$absolute_url.$suffix;
}

     
// Page to scan and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

// file_get_contents() returns false on failure; treat that as an empty page.
$content = file_get_contents($url);
if ($content === false) {
    $content = '';
}

// Drop every tag except <a> so only anchor markup is left to scan.
$tags = strip_tags($content, "<a>");

// Capture the href value of each anchor. The pattern accepts single or double
// quotes and allows other attributes to come before href.
preg_match_all('/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1/is', $tags, $matches);

foreach ($matches[2] as $raw_href) {
    // Attribute values are entity-encoded in the markup (&amp; etc.).
    $href = html_entity_decode($raw_href, ENT_QUOTES, 'UTF-8');

    // An anchor with an empty target has nothing to resolve.
    if ($href === '') {
        continue;
    }

    $link_absolute = relative_path_to_absolute_url($href, $base_url);
    // The URL comes from a remote page: escape it before printing as HTML.
    echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
}


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 




answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link taken from a page against the URL of that page.
 *
 * - A link that already has a scheme is returned unchanged.
 * - An empty link yields $base_url.
 * - A link starting with '#' or '?' is appended to $base_url as-is.
 * - A protocol-relative link ("//host/path") only borrows the base scheme.
 * - Anything else is joined with the base host, port and directory, after
 *   which "//", "/./" and "/segment/../" sequences in the path are collapsed.
 *
 * If $base_url cannot be parsed into at least a scheme and a host, the link
 * is returned unchanged because there is nothing to resolve it against.
 *
 * @param string $relative_url The href value to resolve.
 * @param string $base_url     Absolute URL of the document holding the link.
 * @return string The resolved URL.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // Bail out early so the $relative_url[0] look-ups below never run on an
    // empty string.
    if ($relative_url === '') {
        return $base_url;
    }

    // Anything that already carries a scheme (http:, mailto:, ...) is absolute.
    $relative_scheme = parse_url($relative_url, PHP_URL_SCHEME);
    if (is_string($relative_scheme) && $relative_scheme !== '') {
        return $relative_url;
    }

    if ($relative_url[0] === '#' || $relative_url[0] === '?') {
        return $base_url.$relative_url;
    }

    // Read the components from the array instead of extract()-ing them, so a
    // missing component (e.g. no path) cannot become an undefined variable.
    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return $relative_url;
    }

    // "//cdn.example.com/x" names its own host; only the scheme is inherited.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $base['scheme'].':'.$relative_url;
    }

    // Keep an explicit port, otherwise links on e.g. localhost:8080 break.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':'.$base['port'];
    }

    // Set the query/fragment aside: the clean-up patterns below are meant for
    // the path only and would otherwise mangle "?u=http://..." values.
    $split_at = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split_at);
    $suffix = (string) substr($relative_url, $split_at);

    // Directory of the base document: drop everything after the last slash.
    $path = isset($base['path']) ? $base['path'] : '';
    $path = preg_replace('#/[^/]*$#', '', $path);

    // A root-relative link ignores the base directory.
    if ($relative_url[0] === '/') {
        $path = '';
    }

    $absolute_url = "$authority$path/$relative_path";

    $arr = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');

    // Repeat until a pass changes nothing; one replacement can expose another.
    do {
        $absolute_url = preg_replace($arr, '/', $absolute_url, -1, $n);
    } while ($n > 0);

    return $base['scheme'].'://'.$absolute_url.$suffix;
}
 
      
// Page to scan and the URL its relative links are resolved against.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

$html = file_get_contents($url);

$dom = new DOMDocument();
// Collect parser complaints internally rather than silencing the call with @.
libxml_use_internal_errors(true);

// file_get_contents() returns false on failure and loadHTML() rejects empty
// input, so only parse when there is something to parse.
if (is_string($html) && $html !== '') {
    $dom->loadHTML($html);
}
libxml_clear_errors();

$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");

if ($hrefs instanceof DOMNodeList) {
    foreach ($hrefs as $anchor) {
        // Strip characters that are not legal in a URL.
        $href = filter_var($anchor->getAttribute('href'), FILTER_SANITIZE_URL);

        // Anchors without a usable target have nothing to resolve.
        if (!is_string($href) || $href === '') {
            continue;
        }

        $link_absolute = relative_path_to_absolute_url($href, $base_url);

        // Print only results that are well-formed URLs.
        if (filter_var($link_absolute, FILTER_VALIDATE_URL) !== false) {
            // The URL comes from a remote page: escape it before printing as HTML.
            echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . '<br />';
        }
    }
}



/*
run:
     
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
      
*/

 




answered Sep 14, 2019 by avibootz
...