<?php
// Report every class of PHP diagnostic and print it with the script output.
error_reporting(E_ALL);
ini_set("display_errors", 1);

// Links that have already been reported; follow_links() checks this list
// so that each address is printed only once.
$already_crawled = [];

// Links that have been discovered and are waiting for follow_links() to visit them.
$crawling = [];
/**
 * Fetch an HTML document and describe it as one JSON object followed by a comma.
 *
 * @param string $url Address (or local path) of the HTML document to read.
 * @return string Text of the form `{ "Title": "...", "URL": "..."},`. The title is
 *                an empty string when the document cannot be read or contains no
 *                <title> element.
 */
function get_details($url) {
    $context = stream_context_create(['http' => ['method' => "GET"]]);

    // Read the markup first: the encoding hint used below must be prepended to
    // the document *content*. Prepending it to the address handed to
    // loadHTMLFile() produces a path that does not exist, so nothing is loaded.
    $html = file_get_contents($url, false, $context);

    $title = '';
    if ($html !== false && $html !== '') {
        // Real-world markup is rarely valid; collect parser complaints
        // internally instead of emitting a warning for each one.
        libxml_use_internal_errors(true);
        $doc = new DOMDocument;
        $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
        // Drop the collected errors so the buffer does not grow page after page.
        libxml_clear_errors();

        $node = $doc->getElementsByTagName("title")->item(0);
        if ($node !== null) {
            $title = str_replace("\n", "", $node->nodeValue);
        }
    }

    // json_encode() supplies the surrounding quotes and escapes quotes,
    // backslashes and control characters, keeping the fragment valid JSON.
    $flags = JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR;

    return '{ "Title": ' . json_encode($title, $flags)
        . ', "URL": ' . json_encode((string) $url, $flags) . '},';
}
/**
 * Turn the href of an anchor into an absolute http(s) URL.
 *
 * Dot segments ("./", "../") are left in place rather than collapsed.
 *
 * @param string $base_url Address of the page the link was found on.
 * @param string $href     Raw value of the anchor's href attribute.
 * @return string|null Absolute URL, or null when the link is not something to
 *                     crawl (empty, fragment-only, non-http scheme, or relative
 *                     to a base that has no scheme/host).
 */
function crawler_resolve_link($base_url, $href) {
    $href = trim($href);

    // A fragment only addresses a position inside a page; ignore it.
    $hash = strpos($href, '#');
    if ($hash !== false) {
        $href = substr($href, 0, $hash);
    }
    if ($href === '') {
        return null;
    }

    if (preg_match('#^https?://#i', $href) === 1) {
        return $href;
    }
    // Any other scheme (mailto:, javascript:, tel:, ...) is not a web page.
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
        return null;
    }

    $base = parse_url($base_url);
    if (!is_array($base) || !isset($base['scheme'], $base['host'])) {
        return null;
    }

    // Protocol-relative link: reuse the scheme of the current page.
    if (substr($href, 0, 2) === '//') {
        return $base['scheme'] . ':' . $href;
    }

    $origin = $base['scheme'] . '://' . $base['host'];
    if (isset($base['port'])) {
        $origin .= ':' . $base['port'];
    }
    $path = isset($base['path']) ? $base['path'] : '/';

    if ($href[0] === '/') {
        return $origin . $href;
    }
    if ($href[0] === '?') {
        return $origin . $path . $href;
    }

    // Plain relative link: resolve against the directory of the current page.
    $slash = strrpos($path, '/');
    $directory = $slash === false ? '/' : substr($path, 0, $slash + 1);

    return $origin . $directory . $href;
}

/**
 * Crawl outward from a page, printing the details of every newly seen link.
 *
 * The page at $url is fetched, each link on it that has not been seen before is
 * recorded in the global $already_crawled list, printed via get_details() and
 * queued in the global $crawling list; queued pages are then processed the same
 * way, first in first out, until the queue is empty.
 *
 * @param string $url Address of the page to start from.
 * @return void
 */
function follow_links($url) {
    global $already_crawled;
    global $crawling;

    if (!is_array($already_crawled)) {
        $already_crawled = [];
    }
    if (!is_array($crawling)) {
        $crawling = [];
    }

    // The HTTP context option is named "header" (singular); under any other
    // key the User-Agent line is silently ignored.
    $options = ['http' => ['method' => "GET", 'header' => "User-Agent: howCode/0.1\r\n"]];
    $context = stream_context_create($options);
    libxml_use_internal_errors(true);

    // Work through an explicit queue instead of recursing: recursion over the
    // shared list visits the same pages repeatedly and never unwinds.
    array_unshift($crawling, $url);
    while (count($crawling) > 0) {
        $page = array_shift($crawling);

        $html = file_get_contents($page, false, $context);
        if ($html === false || $html === '') {
            // Unreachable or empty page: nothing to extract, move on.
            continue;
        }

        $doc = new DOMDocument;
        // The encoding hint belongs in front of the markup, not the address.
        $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
        libxml_clear_errors();

        foreach ($doc->getElementsByTagName("a") as $link) {
            $l = crawler_resolve_link($page, $link->getAttribute("href"));
            if ($l === null || in_array($l, $already_crawled, true)) {
                continue;
            }
            $already_crawled[] = $l;
            $crawling[] = $l;
            echo get_details($l)."\n";
        }
    }
}
// Start crawling. The address needs an explicit scheme; without one it is
// treated as a local file path and nothing is fetched.
follow_links("https://youtube.com");