1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
|
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package aspirateur.refanor.aaa.etape1;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
*
* @author sofiene
*/
public class Extraction3 {
int r = 0;
int code;
// public String Url = null;
URL urll;
public String EtapeExtraction (String Url) throws IOException{
r++;
//On se connecte au site et on charge le document html
org.jsoup.nodes.Document doc = Jsoup.connect(Url).get();
//On récupère dans ce document la premiere balise ayant comme nom h1 et pour attribut class="title"
Elements links = doc.select("a[href]");
System.out.println("\nlink Boucle : " +r+ links.get(r).attr("href"));
TestUrl ee = new TestUrl();
boolean resultat = ee.testUrla(links.get(r).attr("href"));
TestUrl ll = new TestUrl();
if (ll.testUrla(Url) == false){
// Url = "http://wwww.wwww.fr";
//Url = links.get(r).attr("href");
}
try {
urll = new URL (Url);
} catch (MalformedURLException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
HttpURLConnection huc = null;
try {
huc = ( HttpURLConnection ) urll.openConnection ();
} catch (IOException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
try {
huc.setRequestMethod ("GET"); //OR huc.setRequestMethod ("HEAD");
} catch (ProtocolException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
try {
huc.connect () ;
} catch (IOException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
try {
code = huc.getResponseCode();
System.out.println(code);
} catch (IOException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
if (code != 200){
System.out.println("erreur de Url");
}
System.out.println(".......;;;"+Url);
if (r < 2000){
System.out.println("..."+r);
//System.out.println("..."+LesUrl);
for (Element LiensA : links){
System.out.println("???????"+LiensA.attr("href"));
Url = EtapeExtraction (LiensA.attr("href"));
}
}
return Url;
}
public static void main (String[] args){
Extraction3 ext3 = new Extraction3();
try {
ext3.EtapeExtraction("http://referencement-annuaire-web.fr/");
} catch (IOException ex) {
Logger.getLogger(Extraction3.class.getName()).log(Level.SEVERE, null, ex);
}
}
} |
Partager