"""Scrapy crawl spider that extracts poems from example.com pages."""

# Standard Python library imports
import re

# 3rd party imports
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

# My imports
# NOTE(review): the item class name is kept exactly as spelled in this
# file -- confirm it matches the definition in poetry_analysis/items.py.
from poetry_analysis.items import poetryanalysisitem

# Regex fragment for an HTML file name; appended to a listing URL to build
# the link-extractor patterns below.
html_file_name = r'.+\.html'


def _split_title(title_text):
    """Split a page title shaped like ``<title> - a poem by <author>``.

    The split is made at the last ``' - '`` so that a poem title which
    itself contains ``' - '`` stays in one piece. When the separator is
    missing, the whole text is returned as the title and the author is
    empty, instead of raising ``ValueError`` on tuple unpacking.

    Returns a ``(title, author)`` tuple with surrounding whitespace removed.
    """
    title, separator, author = title_text.rpartition(' - ')
    if not separator:
        title, author = title_text, ''
    author = author.replace('a poem by', '')
    return title.strip(), author.strip()


class poetryparser(object):
    """
    Provides common parsing method for poems formatted this one specific way.
    """

    # Matches dates such as "01 January 2010".
    date_pattern = r'(\d{2} \w{3,9} \d{4})'

    def parse_poem(self, response):
        """Build a poem item from a downloaded poem page.

        ``response`` is the page response handed over by the crawler; its
        body is queried with XPath and its ``url`` is stored on the item.

        Returns the item with the ``text``, ``url``, ``title``, ``author``
        and ``date`` fields filled in. ``date`` is the list of matches of
        ``date_pattern`` found in the ``<p class="small">`` text nodes.
        """
        hxs = HtmlXPathSelector(response)
        item = poetryanalysisitem()

        # All poetry text is in pre tags
        text = hxs.select('//pre/text()').extract()
        item['text'] = ''.join(text)
        item['url'] = response.url

        # head/title contains "title - a poem by author"; a page without a
        # <title> yields empty strings rather than an IndexError.
        titles = hxs.select('//head/title/text()').extract()
        title_text = titles[0] if titles else ''
        item['title'], item['author'] = _split_title(title_text)

        # date_pattern is a class attribute, so it must be reached via self.
        item['date'] = hxs.select(
            "//p[@class='small']/text()").re(self.date_pattern)
        return item


class poetryspider(CrawlSpider, poetryparser):
    """Crawl the poetry listing pages and parse every linked ``.html`` poem."""

    name = 'example.com_poetry'
    allowed_domains = ['www.example.com']
    root_path = 'someuser/poetry/'
    start_urls = ['http://www.example.com/someuser/poetry/recent/',
                  'http://www.example.com/someuser/poetry/less_recent/']

    # The start URLs are literal text inside a regex, so they are escaped;
    # otherwise every "." in them would match any character.
    rules = [
        Rule(SgmlLinkExtractor(
            allow=[re.escape(start_urls[0]) + html_file_name]),
            callback='parse_poem'),
        Rule(SgmlLinkExtractor(
            allow=[re.escape(start_urls[1]) + html_file_name]),
            callback='parse_poem'),
    ]
# We hope this article is helpful for your Python programming.
