如果你希望在一个写好的程序里调用scrapy，就可以通过下面的代码，让scrapy运行在一个线程里。
# Run a Scrapy crawler inside a dedicated thread, so that an ordinary
# (non-Twisted) program can start crawls and block until they finish.
# Written against the Scrapy 0.8 API (scrapymanager / scrapyengine).
import threading
import queue

from twisted.internet import reactor
from scrapy.xlib.pydispatch import dispatcher
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.core import signals


class CrawlerThread(threading.Thread):
    """Thread that owns the Twisted reactor and runs Scrapy inside it.

    Usage: start() the thread, then call crawl(...) from the main thread;
    each crawl() blocks until the spider_closed signal is dispatched.
    """

    def __init__(self):
        threading.Thread.__init__(self)
        # Becomes True once run() has begun; crawl() refuses to schedule
        # work before the reactor loop is up.
        self.running = False

    def run(self):
        # Executed in this thread: start Scrapy without letting it manage
        # the reactor (control_reactor=False), then run the reactor here.
        # installSignalHandlers=False because OS signal handlers can only
        # be installed from the main thread.
        self.running = True
        scrapymanager.configure(control_reactor=False)
        scrapymanager.start()
        reactor.run(installSignalHandlers=False)

    def crawl(self, *args):
        """Schedule a crawl and block until its spider_closed signal fires.

        Raises RuntimeError if the thread has not been started yet.
        """
        if not self.running:
            raise RuntimeError("CrawlerThread not running")
        self._call_and_block_until_signal(
            signals.spider_closed, scrapymanager.crawl, *args)

    def stop(self):
        # Thread-safe shutdown request: we are not in the reactor thread,
        # so the engine stop must be marshalled via callFromThread.
        reactor.callFromThread(scrapyengine.stop)

    def _call_and_block_until_signal(self, signal, f, *a, **kw):
        # Invoke f(*a, **kw) in the reactor thread and block the calling
        # thread until `signal` is dispatched by scrapy.
        q = queue.Queue()

        def unblock():
            q.put(None)

        dispatcher.connect(unblock, signal=signal)
        reactor.callFromThread(f, *a, **kw)
        q.get()  # blocks until unblock() runs in the reactor thread


# Usage example (guarded so importing this module has no side effects):
if __name__ == '__main__':
    import os
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'myproject.settings')

    from scrapy.conf import settings
    settings.overrides['LOG_ENABLED'] = False  # avoid log noise

    def item_passed(item):
        print("just scraped item:", item)

    dispatcher.connect(item_passed, signal=signals.item_passed)

    crawler = CrawlerThread()
    print("starting crawler thread...")
    crawler.start()

    print("crawling somedomain.com....")
    crawler.crawl('somedomain.com')  # blocking call
    print("crawling anotherdomain.com...")
    crawler.crawl('anotherdomain.com')  # blocking call

    print("stopping crawler thread...")
    crawler.stop()
希望本文所述对大家的python程序设计有所帮助。
