<?php
/**
 * Recursively crawls a frontier of URLs, collecting every new link found
 * on each page and dispatching detail extraction to a parallel worker.
 *
 * Relies on project helpers visible elsewhere in this file/project:
 * DomDocumentParser (fetches + parses a page), createLink (resolves a
 * relative href against its page), getDetails (extracts title/keywords/etc.).
 *
 * @param array $urls frontier of absolute URLs to crawl on this pass
 * @return void
 */
function followLinks($urls) {
    global $alreadyCrawled;
    global $crawling;

    foreach ($urls as $page) {
        // The parser must be built INSIDE the loop, per page: the original
        // constructed it before $page existed and reused it for every URL.
        $parser = new DomDocumentParser($page);
        $linkList = $parser->getLinks();

        foreach ($linkList as $link) {
            $href = $link->getAttribute("href");

            // Skip in-page anchors and javascript: pseudo-links.
            if (strpos($href, "#") !== false) {
                continue;
            } else if (substr($href, 0, 11) == "javascript:") {
                continue;
            }

            // Resolve relative hrefs against the page currently being
            // crawled ($page) — the original referenced an undefined $url.
            $href = createLink($href, $page);

            if (!in_array($href, $alreadyCrawled, true)) {
                $alreadyCrawled[] = $href;
                $crawling[] = $href;

                // parallel task closures do NOT inherit outer-scope
                // variables; $href must be passed explicitly as a task
                // argument. The original closure read undefined variables
                // and called getDetails() on the same href count($crawling)
                // times — once is intended.
                $runtime = new \parallel\Runtime();
                $future = $runtime->run(function ($href) {
                    // Output the page title, descriptions, keywords, URL, Image,
                    // Video, etc... This output is piped off to an external file
                    // using the command line.
                    getDetails($href);
                    return "easy";
                }, [$href]);
            }
        }
    }

    // Remove an item from the array after we have crawled it.
    // This prevents infinitely crawling the same page.
    array_shift($crawling);

    // Stop once the frontier is empty — the original recursed
    // unconditionally and never terminated.
    if (!empty($crawling)) {
        followLinks($crawling);
    }
}
// Seed URLs: the crawl begins from these four sites.
$starts = [
    "https://website1.dn",
    "https://website2.dn",
    "https://website3.dn",
    "https://website4.dn",
];

followLinks($starts);