Welcome to collectivesolver - Programming & Software Q&A. A website you can trust. All programs are tested and work. Contact: aviboots(AT)netvision.net.il
BlueHost Web Hosting

Bonsai Boy of New York Bonsai Trees and Accessories

Website Domains Names & Hosting | Domain.com

Ecommerce Software - Best Ecommerce Platform Made for You - Free Trial

DreamHost Web Hosting


Instant Grammar Checker - Correct all grammar errors and enhance your writing

Liquid Web Cloud VPS Hosting

Disclosure: We are a professional Programming & Software Q&A website that receives compensation from some of the links we show you alongside the information we provide. This means that our content may contain affiliate links.

12,685 questions

17,275 answers

573 users

How to extract all the URLs from a web page in PHP

Online Web Development & Programming Courses | Udemy
43 views
asked Sep 14, 2019 by avibootz
edited Sep 15, 2019 by avibootz

3 Answers

0 votes
/**
 * Resolve a link found in a document against the URL of that document.
 *
 * Handles absolute URLs (returned untouched), empty references,
 * fragment-only ("#top") and query-only ("?page=2") references,
 * protocol-relative links ("//cdn.example.com/x"), root-relative paths
 * ("/tags") and document-relative paths ("../img/a.png"). "." and ".."
 * segments and duplicate slashes are collapsed in the path only; the query
 * string and fragment of the relative URL are carried over verbatim.
 *
 * @param string $relative_url The link as it appears in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 *
 * @return string The absolute URL.
 *
 * @throws InvalidArgumentException If $base_url has no scheme or host and the
 *                                  link actually needs to be resolved against it.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // An empty reference points at the base document itself. Checking this
    // first also keeps the $relative_url[0] access below from hitting an
    // uninitialized string offset.
    if ($relative_url === '') {
        return $base_url;
    }

    // The link already carries its own scheme ("https:", "mailto:", ...).
    if (parse_url($relative_url, PHP_URL_SCHEME) != '') {
        return $relative_url;
    }

    // Fragment-only and query-only references replace that part of the base
    // rather than being appended after an existing query/fragment.
    $first = $relative_url[0];
    if ($first === '#' || $first === '?') {
        $keep = strcspn($base_url, $first === '#' ? '#' : '?#');
        return substr($base_url, 0, $keep) . $relative_url;
    }

    $base = parse_url($base_url);
    if ($base === false || !isset($base['scheme'], $base['host'])) {
        throw new InvalidArgumentException('Base URL must be absolute: ' . $base_url);
    }
    $scheme = $base['scheme'];

    // Protocol-relative link: it names its own host and only borrows the scheme.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $scheme . ':' . $relative_url;
    }

    // Keep a non-default port so links stay on the same origin as the base.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    if ($first === '/') {
        // Root-relative: the base path is ignored entirely.
        $directory = '';
    } else {
        // Document-relative: drop the last segment (the "file") of the base path.
        $directory = preg_replace('#/[^/]*$#', '', isset($base['path']) ? $base['path'] : '');
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite it.
    $split = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split);
    $suffix = (string) substr($relative_url, $split);

    $absolute_url = $authority . $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes any more.
    $patterns = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $absolute_url = preg_replace($patterns, '/', $absolute_url, -1, $replaced);
    } while ($replaced > 0);

    return $scheme . '://' . $absolute_url . $suffix;
}

/**
 * Download a URL with cURL, following redirects, and return the response body.
 *
 * Response headers are not requested. With redirect following enabled cURL
 * emits one header block per hop, so stripping "the" header block by looking
 * for the first blank line leaves the later blocks inside the returned HTML.
 *
 * @param string $url The URL to fetch.
 *
 * @return string The response body, or an empty string when the transfer failed.
 */
function get_html($url) {
    $handle = curl_init();

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_HTTPGET, true);
    // Body only: no header text is mixed into the returned string.
    curl_setopt($handle, CURLOPT_HEADER, false);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);

    $body = curl_exec($handle);

    curl_close($handle);

    // curl_exec() returns false on failure; hand callers a plain string either way.
    return is_string($body) ? $body : '';
}
     
     
// Fetch a page with cURL, parse it with DOMDocument and print every link
// found in an <a href="..."> as an absolute URL, one per line.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";
$html = get_html($url);

// Real-world HTML is rarely valid; buffer libxml's parse errors instead of
// emitting a warning for each one.
libxml_use_internal_errors(true);

$doc = new DOMDocument();

// DOMDocument::loadHTML() rejects an empty document (ValueError on PHP 8),
// so only parse when the download actually produced content.
if (is_string($html) && $html !== '' && $doc->loadHTML($html)) {
    foreach ($doc->getElementsByTagName('a') as $linktag) {
        $href = trim($linktag->getAttribute('href'));

        // Skip anchors without a target. An explicit comparison is used so
        // that a legitimate relative link such as href="0" is not dropped.
        if ($href === '') {
            continue;
        }

        $link_absolute = relative_path_to_absolute_url($href, $base_url);

        // The URL comes from a remote page; escape it before writing it into HTML.
        echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
    }
}

// Release the buffered parse errors.
libxml_clear_errors();


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 




answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link found in a document against the URL of that document.
 *
 * Handles absolute URLs (returned untouched), empty references,
 * fragment-only ("#top") and query-only ("?page=2") references,
 * protocol-relative links ("//cdn.example.com/x"), root-relative paths
 * ("/tags") and document-relative paths ("../img/a.png"). "." and ".."
 * segments and duplicate slashes are collapsed in the path only; the query
 * string and fragment of the relative URL are carried over verbatim.
 *
 * @param string $relative_url The link as it appears in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 *
 * @return string The absolute URL.
 *
 * @throws InvalidArgumentException If $base_url has no scheme or host and the
 *                                  link actually needs to be resolved against it.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // An empty reference points at the base document itself. Checking this
    // first also keeps the $relative_url[0] access below from hitting an
    // uninitialized string offset.
    if ($relative_url === '') {
        return $base_url;
    }

    // The link already carries its own scheme ("https:", "mailto:", ...).
    if (parse_url($relative_url, PHP_URL_SCHEME) != '') {
        return $relative_url;
    }

    // Fragment-only and query-only references replace that part of the base
    // rather than being appended after an existing query/fragment.
    $first = $relative_url[0];
    if ($first === '#' || $first === '?') {
        $keep = strcspn($base_url, $first === '#' ? '#' : '?#');
        return substr($base_url, 0, $keep) . $relative_url;
    }

    $base = parse_url($base_url);
    if ($base === false || !isset($base['scheme'], $base['host'])) {
        throw new InvalidArgumentException('Base URL must be absolute: ' . $base_url);
    }
    $scheme = $base['scheme'];

    // Protocol-relative link: it names its own host and only borrows the scheme.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $scheme . ':' . $relative_url;
    }

    // Keep a non-default port so links stay on the same origin as the base.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    if ($first === '/') {
        // Root-relative: the base path is ignored entirely.
        $directory = '';
    } else {
        // Document-relative: drop the last segment (the "file") of the base path.
        $directory = preg_replace('#/[^/]*$#', '', isset($base['path']) ? $base['path'] : '');
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite it.
    $split = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split);
    $suffix = (string) substr($relative_url, $split);

    $absolute_url = $authority . $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes any more.
    $patterns = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $absolute_url = preg_replace($patterns, '/', $absolute_url, -1, $replaced);
    } while ($replaced > 0);

    return $scheme . '://' . $absolute_url . $suffix;
}

     
// Fetch a page with file_get_contents(), reduce it to its <a> tags and pull
// the href values out with a regular expression, printing each one as an
// absolute URL.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

$content = file_get_contents($url);

// file_get_contents() returns false when the download fails; nothing to scan then.
if ($content !== false) {
    // Drop every tag except anchors so the pattern below only ever sees <a ...>.
    $tags = strip_tags($content, "<a>");

    // Accept href in any attribute position and with either quote style.
    // Requiring whitespace directly before "href" avoids matching attributes
    // such as data-href.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/is';

    if (preg_match_all($pattern, $tags, $matches)) {
        foreach ($matches[2] as $raw_href) {
            // Attribute values are HTML-encoded in the page source (&amp; etc.).
            $href = trim(html_entity_decode($raw_href, ENT_QUOTES, 'UTF-8'));

            if ($href === '') {
                continue;
            }

            $link_absolute = relative_path_to_absolute_url($href, $base_url);

            // The URL comes from a remote page; escape it before writing it into HTML.
            echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . "<br />";
        }
    }
}


    
/*
run:
    
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
     
*/

 




answered Sep 14, 2019 by avibootz
edited Sep 14, 2019 by avibootz
0 votes
/**
 * Resolve a link found in a document against the URL of that document.
 *
 * Handles absolute URLs (returned untouched), empty references,
 * fragment-only ("#top") and query-only ("?page=2") references,
 * protocol-relative links ("//cdn.example.com/x"), root-relative paths
 * ("/tags") and document-relative paths ("../img/a.png"). "." and ".."
 * segments and duplicate slashes are collapsed in the path only; the query
 * string and fragment of the relative URL are carried over verbatim.
 *
 * @param string $relative_url The link as it appears in the document.
 * @param string $base_url     Absolute URL of the document containing the link.
 *
 * @return string The absolute URL.
 *
 * @throws InvalidArgumentException If $base_url has no scheme or host and the
 *                                  link actually needs to be resolved against it.
 */
function relative_path_to_absolute_url($relative_url, $base_url) {
    $relative_url = (string) $relative_url;

    // An empty reference points at the base document itself. Checking this
    // first also keeps the $relative_url[0] access below from hitting an
    // uninitialized string offset.
    if ($relative_url === '') {
        return $base_url;
    }

    // The link already carries its own scheme ("https:", "mailto:", ...).
    if (parse_url($relative_url, PHP_URL_SCHEME) != '') {
        return $relative_url;
    }

    // Fragment-only and query-only references replace that part of the base
    // rather than being appended after an existing query/fragment.
    $first = $relative_url[0];
    if ($first === '#' || $first === '?') {
        $keep = strcspn($base_url, $first === '#' ? '#' : '?#');
        return substr($base_url, 0, $keep) . $relative_url;
    }

    $base = parse_url($base_url);
    if ($base === false || !isset($base['scheme'], $base['host'])) {
        throw new InvalidArgumentException('Base URL must be absolute: ' . $base_url);
    }
    $scheme = $base['scheme'];

    // Protocol-relative link: it names its own host and only borrows the scheme.
    if (strncmp($relative_url, '//', 2) === 0) {
        return $scheme . ':' . $relative_url;
    }

    // Keep a non-default port so links stay on the same origin as the base.
    $authority = $base['host'];
    if (isset($base['port'])) {
        $authority .= ':' . $base['port'];
    }

    if ($first === '/') {
        // Root-relative: the base path is ignored entirely.
        $directory = '';
    } else {
        // Document-relative: drop the last segment (the "file") of the base path.
        $directory = preg_replace('#/[^/]*$#', '', isset($base['path']) ? $base['path'] : '');
    }

    // Split off "?query#fragment" so path normalisation cannot rewrite it.
    $split = strcspn($relative_url, '?#');
    $relative_path = substr($relative_url, 0, $split);
    $suffix = (string) substr($relative_url, $split);

    $absolute_url = $authority . $directory . '/' . $relative_path;

    // Collapse "//", "/./" and "/segment/../" until nothing changes any more.
    $patterns = array('#(/\.?/)#', '#/(?!\.\.)[^/]+/\.\./#');
    do {
        $absolute_url = preg_replace($patterns, '/', $absolute_url, -1, $replaced);
    } while ($replaced > 0);

    return $scheme . '://' . $absolute_url . $suffix;
}
 
      
// Fetch a page with file_get_contents(), select its anchors with XPath and
// print every link that resolves to a valid absolute URL.
$url = "https://www.collectivesolver.com/";
$base_url = "https://www.collectivesolver.com/";

$html = file_get_contents($url);

// Buffer libxml's parse errors instead of silencing the whole call with "@",
// and remember the previous setting so it can be restored afterwards.
$previous_libxml_setting = libxml_use_internal_errors(true);

$dom = new DOMDocument();

// file_get_contents() returns false on failure and DOMDocument::loadHTML()
// rejects an empty document (ValueError on PHP 8), so guard both cases.
if (is_string($html) && $html !== '' && $dom->loadHTML($html)) {
    $xpath = new DOMXPath($dom);

    // Only anchors that actually carry an href. Without the predicate,
    // <a name="..."> yields an empty string that would be resolved to, and
    // printed as, the base URL.
    $anchors = $xpath->evaluate("/html/body//a[@href]");

    foreach ($anchors as $anchor) {
        $href = filter_var(trim($anchor->getAttribute('href')), FILTER_SANITIZE_URL);

        if ($href === false || $href === '') {
            continue;
        }

        $link_absolute = relative_path_to_absolute_url($href, $base_url);

        if (filter_var($link_absolute, FILTER_VALIDATE_URL) !== false) {
            // The URL comes from a remote page; escape it before writing it into HTML.
            echo htmlspecialchars($link_absolute, ENT_QUOTES, 'UTF-8') . '<br />';
        }
    }
}

libxml_clear_errors();
libxml_use_internal_errors($previous_libxml_setting);



/*
run:
     
https://www.collectivesolver.com/
https://www.collectivesolver.com/questions
https://www.collectivesolver.com/tags
https://www.collectivesolver.com/users
https://www.collectivesolver.com/tag/python
https://www.collectivesolver.com/tag/php
https://www.collectivesolver.com/tag/c%23
https://www.collectivesolver.com/tag/java
https://www.collectivesolver.com/tag/cpp
https://www.collectivesolver.com/tag/c
https://www.collectivesolver.com/tag/javascript
https://www.collectivesolver.com/tag/vb%23
...
      
*/

 




answered Sep 14, 2019 by avibootz
...