{"id":223078,"date":"2021-08-09T10:15:45","date_gmt":"2021-08-09T02:15:45","guid":{"rendered":"https:\/\/lrxjmw.cn\/?p=223078"},"modified":"2021-07-27T13:17:20","modified_gmt":"2021-07-27T05:17:20","slug":"python-scrapy-imagespipeline","status":"publish","type":"post","link":"https:\/\/lrxjmw.cn\/python-scrapy-imagespipeline.html","title":{"rendered":"Python\u4e2dscrapy\u4e0b\u8f7d\u4fdd\u5b58\u56fe\u7247"},"content":{"rendered":"\n\n\n

\u5bfc\u8bfb<\/td>\n

\u5728\u65e5\u5e38\u722c\u866b\u7ec3\u4e60\u4e2d\uff0c\u6211\u4eec\u722c\u53d6\u5230\u7684\u6570\u636e\u9700\u8981\u8fdb\u884c\u4fdd\u5b58\u64cd\u4f5c\uff0c\u5728scrapy\u4e2d\u6211\u4eec\u53ef\u4ee5\u4f7f\u7528ImagesPipeline\u8fd9\u4e2a\u7c7b\u6765\u8fdb\u884c\u76f8\u5173\u64cd\u4f5c\uff0c\u8fd9\u4e2a\u7c7b\u662fscrapy\u5df2\u7ecf\u5c01\u88c5\u597d\u7684\u4e86\uff0c\u6211\u4eec\u76f4\u63a5\u62ff\u6765\u7528\u5373\u53ef\u3002<\/strong><\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n
$\"\"$ <\/p>\n
\u5728\u4f7f\u7528ImagesPipeline\u4e0b\u8f7d\u56fe\u7247\u6570\u636e\u65f6\uff0c\u6211\u4eec\u9700\u8981\u5bf9\u5176\u4e2d\u7684\u4e09\u4e2a\u7ba1\u9053\u7c7b\u65b9\u6cd5\u8fdb\u884c\u91cd\u5199\uff0c\u5176\u4e2d \u2014 get_media_requests \u662f\u5bf9\u56fe\u7247\u5730\u5740\u53d1\u8d77\u8bf7\u6c42<\/p>\n
\u2014 file_path \u662f\u8fd4\u56de\u56fe\u7247\u540d\u79f0<\/p>\n
\u2014 item_completed \u662f\u8fd4\u56deitem\uff0c\u5c06\u5176\u8fd4\u56de\u7ed9\u4e0b\u4e00\u4e2a\u5373\u5c06\u88ab\u6267\u884c\u7684\u7ba1\u9053\u7c7b<\/p>\n
$\"\"$ <\/p>\n
\u90a3\u5177\u4f53\u4ee3\u7801\u662f\u4ec0\u4e48\u6837\u7684\u5462\uff0c\u9996\u5148\u6211\u4eec\u9700\u8981\u5728pipelines.py\u6587\u4ef6\u4e2d\uff0c\u5bfc\u5165ImagesPipeline\u7c7b\uff0c\u7136\u540e\u91cd\u5199\u4e0a\u8ff0\u6240\u8bf4\u76843\u4e2a\u65b9\u6cd5\uff1a<\/p>\n
\r\nfrom scrapy.pipelines.images import ImagesPipeline\r\nimport scrapy\r\nimport os\r\n \r\n \r\nclass ImgsPipLine(ImagesPipeline):\r\n def get_media_requests(self, item, info):\r\n yield scrapy.Request(url = item['img_src'],meta={'item':item})\r\n \r\n \r\n #\u8fd4\u56de\u56fe\u7247\u540d\u79f0\u5373\u53ef\r\n def file_path(self, request, response=None, info=None):\r\n item = request.meta['item']\r\n print('########',item)\r\n filePath = item['img_name']\r\n return filePath\r\n \r\n def item_completed(self, results, item, info):\r\n return item\r\n<\/pre>\n
\u65b9\u6cd5\u5b9a\u4e49\u597d\u540e\uff0c\u6211\u4eec\u9700\u8981\u5728settings.py\u914d\u7f6e\u6587\u4ef6\u4e2d\u8fdb\u884c\u8bbe\u7f6e\uff0c\u4e00\u4e2a\u662f\u6307\u5b9a\u56fe\u7247\u4fdd\u5b58\u7684\u4f4d\u7f6eIMAGES_STORE = 'D:\\\\ImgPro'\uff0c\u7136\u540e\u5c31\u662f\u542f\u7528\u201cImgsPipLine\u201d\u7ba1\u9053\uff1a<\/p>\n
\r\nITEM_PIPELINES = {\r\n 'imgPro.pipelines.ImgsPipLine': 300, #300\u4ee3\u8868\u4f18\u5148\u7ea7\uff0c\u6570\u5b57\u8d8a\u5c0f\u4f18\u5148\u7ea7\u8d8a\u9ad8\r\n}\r\n<\/pre>\n
\u8bbe\u7f6e\u5b8c\u6210\u540e\uff0c\u6211\u4eec\u8fd0\u884c\u7a0b\u5e8f\u5c31\u53ef\u4ee5\u770b\u5230\u201cD:\\ImgPro\u201d\u4e0b\u4fdd\u5b58\u6210\u529f\u7684\u56fe\u7247\u3002<\/p>\n
\u5b8c\u6574\u4ee3\u7801\u5982\u4e0b\uff1a<\/p>\n
spider\u6587\u4ef6\u4ee3\u7801\uff1a<\/p>\n
\r\n# -*- coding: utf-8 -*-\r\nimport scrapy\r\nfrom imgPro.items import ImgproItem\r\n \r\n \r\n \r\nclass ImgSpider(scrapy.Spider):\r\n name = 'img'\r\n allowed_domains = ['www.521609.com']\r\n start_urls = ['http:\/\/www.521609.com\/daxuemeinv\/']\r\n \r\n def parse(self, response):\r\n #\u89e3\u6790\u56fe\u7247\u5730\u5740\u548c\u56fe\u7247\u540d\u79f0\r\n li_list = response.xpath('\/\/div[@class=\"index_img list_center\"]\/ul\/li')\r\n for li in li_list:\r\n item = ImgproItem()\r\n item['img_src'] = 'http:\/\/www.521609.com\/' + li.xpath('.\/a[1]\/img\/@src').extract_first()\r\n item['img_name'] = li.xpath('.\/a[1]\/img\/@alt').extract_first() + '.jpg'\r\n # print('***********')\r\n # print(item)\r\n yield item\r\n<\/pre>\n
items.py\u6587\u4ef6<\/p>\n
import scrapy\r\n \r\n \r\nclass ImgproItem(scrapy.Item):\r\n # define the fields for your item here like:\r\n # name = scrapy.Field()\r\n img_src = scrapy.Field()\r\n img_name = scrapy.Field()\r\n<\/pre>\n
pipelines.py\u6587\u4ef6<\/p>\n
\r\nfrom scrapy.pipelines.images import ImagesPipeline\r\nimport scrapy\r\nimport os\r\nfrom imgPro.settings import IMAGES_STORE as IMGS\r\n \r\nclass ImgsPipLine(ImagesPipeline):\r\n def get_media_requests(self, item, info):\r\n yield scrapy.Request(url = item['img_src'],meta={'item':item})\r\n \r\n \r\n #\u8fd4\u56de\u56fe\u7247\u540d\u79f0\u5373\u53ef\r\n def file_path(self, request, response=None, info=None):\r\n item = request.meta['item']\r\n print('########',item)\r\n filePath = item['img_name']\r\n return filePath\r\n \r\n def item_completed(self, results, item, info):\r\n return item\r\n<\/pre>\n
settings.py\u6587\u4ef6<\/p>\n
\r\nimport random\r\nBOT_NAME = 'imgPro'\r\n \r\nSPIDER_MODULES = ['imgPro.spiders']\r\nNEWSPIDER_MODULE = 'imgPro.spiders'\r\n \r\nIMAGES_STORE = 'D:\\\\ImgPro' #\u6587\u4ef6\u4fdd\u5b58\u8def\u5f84\r\nLOG_LEVEL = \"WARNING\"\r\nROBOTSTXT_OBEY = False\r\n#\u8bbe\u7f6euser-agent\r\nUSER_AGENTS_LIST = [\r\n \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.1 (KHTML, like Gecko) Chrome\/22.0.1207.1 Safari\/537.1\",\r\n \"Mozilla\/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit\/536.11 (KHTML, like Gecko) Chrome\/20.0.1132.57 Safari\/536.11\",\r\n \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/536.6 (KHTML, like Gecko) Chrome\/20.0.1092.0 Safari\/536.6\",\r\n \"Mozilla\/5.0 (Windows NT 6.2) AppleWebKit\/536.6 (KHTML, like Gecko) Chrome\/20.0.1090.0 Safari\/536.6\",\r\n \"Mozilla\/5.0 (Windows NT 6.2; WOW64) AppleWebKit\/537.1 (KHTML, like Gecko) Chrome\/19.77.34.5 Safari\/537.1\",\r\n \"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/536.5 (KHTML, like Gecko) Chrome\/19.0.1084.9 Safari\/536.5\",\r\n \"Mozilla\/5.0 (Windows NT 6.0) AppleWebKit\/536.5 (KHTML, like Gecko) Chrome\/19.0.1084.36 Safari\/536.5\",\r\n \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1063.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 5.1) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1063.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1063.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.2) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1062.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1062.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.2) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1061.1 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1061.1 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.1) 
AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1061.1 Safari\/536.3\",\r\n \"Mozilla\/5.0 (Windows NT 6.2) AppleWebKit\/536.3 (KHTML, like Gecko) Chrome\/19.0.1061.0 Safari\/536.3\",\r\n \"Mozilla\/5.0 (X11; Linux x86_64) AppleWebKit\/535.24 (KHTML, like Gecko) Chrome\/19.0.1055.1 Safari\/535.24\",\r\n \"Mozilla\/5.0 (Windows NT 6.2; WOW64) AppleWebKit\/535.24 (KHTML, like Gecko) Chrome\/19.0.1055.1 Safari\/535.24\"\r\n ]\r\nUSER_AGENT = random.choice(USER_AGENTS_LIST)\r\nDEFAULT_REQUEST_HEADERS = {\r\n 'Accept': 'text\/html,application\/xhtml+xml,application\/xml;q=0.9,*\/*;q=0.8',\r\n 'Accept-Language': 'en',\r\n # 'User-Agent':\"Mozilla\/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/58.0.3029.110 Safari\/537.36\",\r\n 'User-Agent':USER_AGENT\r\n}\r\n \r\n#\u542f\u52a8pipeline\u7ba1\u9053\r\nITEM_PIPELINES = {\r\n 'imgPro.pipelines.ImgsPipLine': 300,\r\n}\r\n<\/pre>\n
\u4ee5\u4e0a\u5373\u662f\u4f7f\u7528ImagesPipeline\u4e0b\u8f7d\u4fdd\u5b58\u56fe\u7247\u7684\u65b9\u6cd5\uff0c\u4eca\u5929\u7a81\u751f\u4e00\u4e2a\u7591\u60d1\uff0c\u722c\u866b\u722c\u7684\u597d\uff0c\u771f\u7684\u662f\u7262\u996d\u5403\u7684\u9971\u5417\uff1f\u8fd8\u8bf7\u5404\u4f4d\u5927\u4f6c\u89e3\u7b54\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"
\u5728\u4f7f\u7528ImagesPipeline\u4e0b\u8f7d\u56fe\u7247\u6570\u636e\u65f6\uff0c\u6211\u4eec\u9700\u8981\u5bf9\u5176\u4e2d\u7684\u4e09\u4e2a\u7ba1\u9053\u7c7b\u65b9\u6cd5\u8fdb\u884c\u91cd\u5199\uff0c\u5176\u4e2d \u2014 get_m […]<\/p>\n","protected":false},"author":668,"featured_media":223497,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[55],"tags":[],"class_list":["post-223078","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-thread"],"acf":[],"_links":{"self":[{"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/posts\/223078","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/users\/668"}],"replies":[{"embeddable":true,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/comments?post=223078"}],"version-history":[{"count":6,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/posts\/223078\/revisions"}],"predecessor-version":[{"id":223498,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/posts\/223078\/revisions\/223498"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/media\/223497"}],"wp:attachment":[{"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/media?parent=223078"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/categories?post=223078"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/lrxjmw.cn\/wp-json\/wp\/v2\/tags?post=223078"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}