from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
import MySQLdb

# Adjust this import to wherever your project defines DmozItem.
from myproject.items import DmozItem


class Crawl2Spider(BaseSpider):
    name = "crawl2"

    db = MySQLdb.connect(host="localhost", user="root", passwd="",
                         db="crawler_engine", charset='utf8',
                         use_unicode=False)
    cur = db.cursor()
    cur.execute("select url from urls where num_crawl = 1")

    # List of the URLs we are going to crawl: the result of the query
    # above. fetchall() returns one tuple per row, so keep the first
    # column of each row (the original loop reassigned start_urls on
    # every iteration, leaving only the last URL).
    start_urls = [row[0] for row in cur.fetchall()]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        item = DmozItem()
        item['link'] = hxs.select('//div/ul/li/a/@href').extract()

        cursor = self.db.cursor()
        for link in item['link']:
            # Parameterized query: the driver escapes the values, unlike
            # the original string formatting, which was open to SQL
            # injection and broke on URLs containing quotes.
            cursor.execute(
                "insert into urls(url, domain, num_crawl) values (%s, %s, %s)",
                (link, 'test', 1))
        self.db.commit()
        return item
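As an aside, the more idiomatic place for the database writes in Scrapy is an item pipeline, so the spider only extracts data. A minimal sketch under the same table and connection settings as above (the class name MySQLStorePipeline is made up for the example); it would be enabled through ITEM_PIPELINES in settings.py:

import MySQLdb


class MySQLStorePipeline(object):
    def open_spider(self, spider):
        # One connection per crawl, opened when the spider starts.
        self.db = MySQLdb.connect(host="localhost", user="root", passwd="",
                                  db="crawler_engine", charset='utf8',
                                  use_unicode=False)

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Store every extracted link, again with a parameterized query.
        cursor = self.db.cursor()
        for link in item['link']:
            cursor.execute(
                "insert into urls(url, domain, num_crawl) values (%s, %s, %s)",
                (link, 'test', 1))
        self.db.commit()
        return item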