import re
import time

import requests
import scrapy
from lxml import etree

from tutorial.items import TutorialItem


class CnblogsSpider(scrapy.Spider):
    name = "cnblogsSpider"
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        "http://www.cnblogs.com/AllBloggers.aspx",
    ]

    def parse(self, response):
        # The blogger index links each blog from the first <a> of a table cell.
        user_urls = response.xpath('//td/a[1]/@href').extract()
        for user_url in user_urls:
            yield scrapy.Request(user_url, callback=self.parse_category)

    def parse_category(self, response):
        # The blog's app name is embedded in an inline script as: currentBlogApp = '...'
        script = response.xpath('//script/text()').extract()[0]
        match = re.findall(r"currentBlogApp =.+'", script)[0]
        current_blog_app = match.replace("currentBlogApp = '", "").replace("'", "")

        # The sidebar holding the category list is loaded separately, so fetch it
        # with requests (note: a blocking call inside Scrapy) and parse it with lxml.
        url = ('http://www.cnblogs.com/' + current_blog_app +
               '/mvc/blog/sidecolumn.aspx?blogApp=' + current_blog_app)
        r = requests.get(url)
        sidebar = etree.HTML(r.content)
        category_urls = sidebar.xpath('//div[@id="sidebar_categories"]//a/@href')
        self.logger.debug('found %d category urls', len(category_urls))
        for category_url in category_urls:
            self.logger.debug('categoryUrl %s', category_url)
            yield scrapy.Request(category_url, callback=self.parse_item_list)

    def parse_item_list(self, response):
        # A category page lists post titles linking to the full articles.
        post_title_urls = response.xpath(
            '//div[@class="entrylistPosttitle"]/a/@href').extract()
        for title_url in post_title_urls:
            self.logger.debug('titleUrl %s', title_url)
            yield scrapy.Request(title_url, callback=self.parse_item)

    def parse_item(self, response):
        item = TutorialItem()

        # currentBlogApp, cb_blogId and cb_entryId live in the page's inline
        # scripts; they are needed to query the CategoriesTags endpoint below.
        scripts = response.xpath('//script/text()').extract()
        match = re.findall(r"currentBlogApp =.+'", scripts[0])[0]
        current_blog_app = match.replace("currentBlogApp = '", "").replace("'", "")

        match = re.findall(r'cb_blogId=.+,cb_entryId', scripts[1])[0]
        cb_blog_id = match.replace('cb_blogId=', '').replace(',cb_entryId', '')

        match = re.findall(r'cb_entryId=.+,cb_blogApp', scripts[1])[0]
        cb_entry_id = match.replace('cb_entryId=', '').replace(',cb_blogApp', '')

        # Also collect the first four cb_* parameters into a dict.
        request_data = {}
        js = response.xpath('//div[@id="topics"]/script/text()').extract_first()
        for field in re.findall(r'cb_blogId.+,cb_entryCreatedDate', js)[0].split(',')[0:4]:
            parts = field.split('=')
            request_data[parts[0]] = parts[1]

        item["title"] = response.xpath(
            '//a[@id="cb_post_title_url"]/text()').extract_first()
        publish_date = response.xpath('//span[@id="post-date"]/text()').extract_first()
        item["publishDate"] = publish_date if publish_date is not None else ""
        item["url"] = response.url

        # The post body is spread over many text nodes; join them into one string.
        item["content"] = ' '.join(
            response.xpath('//div[@id="cnblogs_post_body"]//text()').extract())

        # Categories are served by a separate endpoint as an HTML fragment inside
        # JSON; strip the anchor markup to keep just the category names.
        r = requests.get(
            'http://www.cnblogs.com/mvc/blog/CategoriesTags.aspx?blogApp=' +
            current_blog_app + '&blogId=' + cb_blog_id + '&postId=' + cb_entry_id)
        categories = r.json().get('Categories')
        links = re.findall(r'>.*?</a>', categories)
        item["keywords"] = ' '.join(
            link.replace('>', '').replace('</a', '') for link in links)

        item["scrapyDate"] = time.strftime('%Y-%m-%d %H:%M', time.localtime())
        yield item
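The spider imports TutorialItem from tutorial.items, but that module is not shown in the listing. Below is a minimal sketch of what it would need to declare: the six field names come from the assignments in parse_item above, and everything else is standard Scrapy item boilerplate.

# tutorial/items.py -- a minimal sketch; only the field names are taken from
# the spider above, the rest is standard Scrapy item boilerplate.
import scrapy


class TutorialItem(scrapy.Item):
    title = scrapy.Field()        # post title
    publishDate = scrapy.Field()  # publish date string from the page
    url = scrapy.Field()          # URL of the post
    content = scrapy.Field()      # joined text of the post body
    keywords = scrapy.Field()     # category names from CategoriesTags.aspx
    scrapyDate = scrapy.Field()   # local timestamp when the item was scraped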
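Assuming the project follows the usual Scrapy layout under a tutorial package (as the import path suggests), the spider can then be run from the project root, dumping the collected items to JSON:

scrapy crawl cnblogsSpider -o posts.json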