Scraping Weather Forecasts into MySQL with Scrapy
Create the Scrapy project:
scrapy startproject weather2
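Running this generates the standard Scrapy project skeleton (layout as of the Scrapy versions current when this post was written; newer releases add a few more files):

weather2/
    scrapy.cfg            # deploy configuration
    weather2/
        __init__.py
        items.py          # item definitions (edited below)
        pipelines.py      # item pipelines (edited below)
        settings.py       # project settings (edited below)
        spiders/
            __init__.py   # spider modules go in this package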
Define the Items (items.py):
import scrapy

class Weather2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    weatherDate = scrapy.Field()
    weatherDate2 = scrapy.Field()
    weatherWea = scrapy.Field()
    weatherTem1 = scrapy.Field()
    weatherTem2 = scrapy.Field()
    weatherWin = scrapy.Field()
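A Scrapy Item behaves like a dict, which is what both the spider and the pipeline below rely on; a quick interactive illustration (Python 2, matching the rest of this post):

>>> from weather2.items import Weather2Item
>>> item = Weather2Item()
>>> item['weatherWea'] = [u'rainy']   # field values are lists, mirroring what .extract() returns
>>> print item['weatherWea'][0]
rainy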
Write the Spider (spiders/weatherSpider.py):
import scrapy
from weather2.items import Weather2Item

class CatchWeatherSpider(scrapy.Spider):
    name = 'CatchWeather2'
    allowed_domains = ['weather.com.cn']
    start_urls = [
        "http://www.weather.com.cn/weather/101280101.shtml"
    ]

    def parse(self, response):
        for sel in response.xpath('//*[@id="7d"]/ul/li'):
            item = Weather2Item()
            item['weatherDate'] = sel.xpath('h1/text()').extract()
            item['weatherDate2'] = sel.xpath('h2/text()').extract()
            item['weatherWea'] = sel.xpath('p[@class="wea"]/text()').extract()
            item['weatherTem1'] = (sel.xpath('p[@class="tem tem1"]/span/text()').extract()
                                   + sel.xpath('p[@class="tem tem1"]/i/text()').extract())
            item['weatherTem2'] = (sel.xpath('p[@class="tem tem2"]/span/text()').extract()
                                   + sel.xpath('p[@class="tem tem2"]/i/text()').extract())
            item['weatherWin'] = sel.xpath('p[@class="win"]/i/text()').extract()
            yield item
- name: the spider's name.
- allowed_domains: the base domain(s) the spider is allowed to crawl.
- start_urls: the list of URLs the spider starts from. The spider downloads these pages first, and all subsequent URLs are extracted from that data.

The data source is http://www.weather.com.cn/weather/101280101.shtml, where 101280101 is the city code for Guangzhou.

XPath handles all the HTML parsing here, and it turns out to be pleasantly simple; to experiment with the expressions first, see the shell session below.
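A quick way to test the XPath expressions before running the full crawl is Scrapy's built-in interactive shell (the selectors below assume the 7-day forecast markup described above):

scrapy shell "http://www.weather.com.cn/weather/101280101.shtml"
>>> # one <li> per forecast day, inside the element with id="7d"
>>> response.xpath('//*[@id="7d"]/ul/li/h1/text()').extract()
>>> response.xpath('//*[@id="7d"]/ul/li/p[@class="wea"]/text()').extract()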
Test run:
scrapy crawl CatchWeather2
Result snippet: the scraped items show up in the crawl log, so we already have the data we want.
Create the database table:
CREATE TABLE `yunweiApp_weather` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `weatherDate` varchar(10) DEFAULT NULL,
  `weatherDate2` varchar(10) NOT NULL,
  `weatherWea` varchar(10) NOT NULL,
  `weatherTem1` varchar(10) NOT NULL,
  `weatherTem2` varchar(10) NOT NULL,
  `weatherWin` varchar(10) NOT NULL,
  `updateTime` datetime NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=15 DEFAULT CHARSET=utf8;
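The table lives in the game_main database that the pipeline below connects to; assuming that database and the lihuipeng account do not exist yet, a setup along these lines would work (MySQL 5.x syntax; names taken from the pipeline configuration, adjust to your environment):

CREATE DATABASE IF NOT EXISTS game_main DEFAULT CHARSET utf8;
-- Account and password as configured in pipelines.py below:
GRANT ALL PRIVILEGES ON game_main.* TO 'lihuipeng'@'%' IDENTIFIED BY 'lihuipeng';
FLUSH PRIVILEGES;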
Create the pipeline (pipelines.py):
import MySQLdb
import datetime

DEBUG = True

if DEBUG:
    dbuser = 'lihuipeng'
    dbpass = 'lihuipeng'
    dbname = 'game_main'
    dbhost = '192.168.1.100'
    dbport = 3306
else:
    dbuser = 'root'
    dbpass = 'lihuipeng'
    dbname = 'game_main'
    dbhost = '127.0.0.1'
    dbport = 3306

class MySQLStorePipeline(object):
    def __init__(self):
        self.conn = MySQLdb.connect(user=dbuser, passwd=dbpass, db=dbname,
                                    host=dbhost, port=dbport,
                                    charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()
        # Empty the table first so every crawl replaces the previous forecast:
        self.cursor.execute("truncate table yunweiApp_weather;")
        self.conn.commit()

    def process_item(self, item, spider):
        curTime = datetime.datetime.now()
        try:
            self.cursor.execute(
                """INSERT INTO yunweiApp_weather
                       (weatherDate, weatherDate2, weatherWea, weatherTem1,
                        weatherTem2, weatherWin, updateTime)
                   VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                (
                    item['weatherDate'][0].encode('utf-8'),
                    item['weatherDate2'][0].encode('utf-8'),
                    item['weatherWea'][0].encode('utf-8'),
                    item['weatherTem1'][0].encode('utf-8'),
                    item['weatherTem2'][0].encode('utf-8'),
                    item['weatherWin'][0].encode('utf-8'),
                    curTime,
                )
            )
            self.conn.commit()
        except MySQLdb.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
        return item
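One optional refinement, a sketch not in the original code: the pipeline never closes its connection. Scrapy calls close_spider() on a pipeline when the spider finishes, so adding this method to MySQLStorePipeline would release the handle cleanly:

    def close_spider(self, spider):
        # Called once when the spider finishes; release the DB handle.
        self.cursor.close()
        self.conn.close()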
Modify settings.py to enable the pipeline:
ITEM_PIPELINES = {
    #'weather2.pipelines.Weather2Pipeline': 300,
    'weather2.pipelines.MySQLStorePipeline': 400,
}
The trailing number is just a priority weight; any value in the 0-1000 range works, and when several pipelines are enabled, items pass through them in ascending order of this number.
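For example, with a second, hypothetical validation pipeline enabled alongside it, items would pass through validation before reaching the MySQL writer:

ITEM_PIPELINES = {
    'weather2.pipelines.ValidatePipeline': 100,    # hypothetical: runs first
    'weather2.pipelines.MySQLStorePipeline': 400,  # runs second
}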
Re-run the test:
scrapy crawl CatchWeather2
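To confirm the rows landed, a quick check from the MySQL client (not part of the original post):

SELECT weatherDate, weatherWea, weatherTem1, weatherTem2, weatherWin, updateTime
FROM yunweiApp_weather;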
Result: the forecast rows are in the table. All done!
This article was reposted from the Yunwei Biji blog (lihuipeng) on 51CTO. Original link: http://blog.51cto.com/lihuipeng/1711852. Please contact the original author before reprinting.