新浪股票信息爬取

新浪股票信息爬取

老师想要通过数据做股票预测,我就帮老师爬取了一点数据。
大一时候写的代码了,比较蠢。
不做解释了,只贴上当时的代码。留作纪念吧。

1. 爬取股票列表

如果要爬取各个股票的信息,首先要有股票的代码,所以先爬取股票的编号,为后续做准备。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# -*- coding:UTF-8 -*-
"""Crawl the stock list page and store each (name, code) pair in MongoDB,
then dump the raw anchor texts to a CSV for later use."""
import requests
from lxml import etree
from pymongo import MongoClient
import re
import pandas as pd

client = MongoClient()
db = client.gupiao
my_set = db.gupiao_number  # collection holding {name, number} documents
url = 'http://quote.eastmoney.com/stocklist.html#sz'
r = requests.get(url)
r.encoding = r.apparent_encoding  # let requests guess the page encoding (site is not UTF-8)
s = etree.HTML(r.text)
# Each anchor text looks like "平安银行(000001)"; split on parentheses below.
number_list = s.xpath('//*[@id="quotesearch"]//a[@target]/text()')
# enumerate(start=1) fixes the original off-by-one: the counter was bumped
# before printing, so the first record was reported as "第2条".
for count, item in enumerate(number_list, start=1):
    print("正在插入第{}条数据".format(count))
    name = re.split(r'\(|\)', item)[0:2]  # -> [stock name, stock code]
    # insert_one replaces the deprecated Collection.insert API.
    my_set.insert_one(dict(name=name[0], number=name[1]))

print(len(number_list))
df = pd.DataFrame(number_list)
df.to_csv('number_list.csv', encoding='utf-8-sig')  # utf-8-sig so Excel opens it correctly

2. 通过股票编号爬取各个股票近十年的各个季度的股票数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding:UTF-8 -*-
from datetime import datetime
import requests
from lxml import etree
from pymongo import MongoClient

# MongoDB handles: scraped financials are written to gupiaoDB.fulldata,
# stock codes are read from gupiao.gupiao_number (filled by the list crawler).
client = MongoClient()
db = client.gupiaoDB
my_set = db.fulldata

db_number = client.gupiao
my_number = db_number.gupiao_number

cursor = my_number.find()
my_number_list = []  # filled by get_gupiao_number()
my_number_list_new = []  # module-level placeholder; get_data() rebinds a local of the same name
def get_gupiao_number():
    """Load every stock code from the gupiao_number collection into my_number_list."""
    for doc in cursor:
        my_number_list.append(doc['number'])
    print("获取股票编号成功")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))

def get_data():
    """Resume crawling after stock `id_duan`, fetch each stock's indicator pages
    and save the merged per-stock document to MongoDB.

    Relies on module globals: my_number_list (codes) and my_set (target collection).
    """
    id_duan = '300285'  # last code saved by a previous run; resume right after it
    my_number_list_new = my_number_list[my_number_list.index(id_duan) + 1:]
    total = len(my_number_list_new)

    # enumerate replaces the original repeated my_number_list_new.index(i) calls,
    # which made every iteration an O(n) scan (accidental O(n^2) overall).
    for pos, code in enumerate(my_number_list_new, start=1):
        try:
            full_data_list = dict()
            url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/' \
                  'stockid/{}/ctrl/2017/displaytype/4.phtml'.format(code)
            r = requests.get(url)
            r.encoding = r.apparent_encoding
            s = etree.HTML(r.text)

            # Links to earlier years; [1:] skips the current (2017) entry.
            year_list = s.xpath('//*[@id="con02-1"]/table/td/a/text()')[1:]

            print("爬取进程:{:.2%}\n".format(pos / total))
            print("本次运行已经保存{}支股票\n".format(pos))
            print("剩余{}支股票\n".format(total - pos + 1))
            print("正在爬取{}号股票\n".format(code))

            # 2017 is parsed from the page we already fetched; older years
            # need one request each (get_usual_data).
            dict_2017 = get_2017_data(code, 2017, s)
            if dict_2017:
                full_data_list.update(dict_2017)
            else:
                continue  # empty/invalid page — skip this stock entirely

            for year in year_list:
                try:
                    full_data_list.update(get_usual_data(code, year))
                except Exception as e:
                    print("异常:{}".format(e))
                    continue  # one bad year must not lose the whole stock

            my_set.insert(dict(full_data_list))
            print("{}号股票所有数据保存成功\t".format(code))
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))

        except Exception as e:
            # Broad catch is deliberate: a long-running crawl must survive
            # any single-stock failure.
            print("异常:{}".format(e))
            continue


def get_2017_data(gupiao_number, year, s):
    """Extract the quarters of *year* from the already-parsed page *s*.

    Returns a dict {id, name, '<year>-<date>': {indicator: value}, ...},
    or None when the page carries no stock name (empty/invalid page).
    """
    indicators = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[1]/a/text()')
    col_q4 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[2]/text()')[1:]
    col_q3 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[3]/text()')[1:]
    col_q2 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[4]/text()')[1:]
    my_gupiao_name = s.xpath('//*[@id="toolbar"]/div[1]/h1/a/text()')

    if not my_gupiao_name:
        print("\t{}号股票2017年数据为空\t".format(gupiao_number))
        print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
        return None

    record = dict()
    record['id'] = gupiao_number
    record['name'] = my_gupiao_name[0]
    # Keep only quarters whose column actually has values.
    for suffix, column in (('12-31', col_q4), ('9-30', col_q3), ('6-30', col_q2)):
        if column:
            record['{}-{}'.format(year, suffix)] = dict(zip(indicators, column))
        else:
            print('{}号股票2017-{}数据不存在'.format(gupiao_number, suffix))

    print("\t{}号股票2017年数据返回成功\t".format(gupiao_number))
    print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
    return record

def get_usual_data(gupiao_number, year):
    """Fetch one year's financial-indicator page for a stock.

    Returns a dict {id, name, '<year>-<date>': {indicator: value}, ...},
    or None when the page carries no stock name (empty/invalid page —
    the caller's try/except absorbs the resulting update(None) failure).
    """
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/' \
          'stockid/{}/ctrl/{}/displaytype/4.phtml'.format(gupiao_number, year)

    r = requests.get(url)
    r.encoding = r.apparent_encoding
    s = etree.HTML(r.text)
    indicators = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[1]/a/text()')
    col_q4 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[2]/text()')[1:]
    col_q3 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[3]/text()')[1:]
    col_q2 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[4]/text()')[1:]
    col_q1 = s.xpath('//*[@id="con02-1"]/table[2]//tr/td[5]/text()')[1:]
    my_gupiao_name = s.xpath('//*[@id="toolbar"]/div[1]/h1/a/text()')

    if not my_gupiao_name:
        print("\t{}号股票{}年数据为空\t".format(gupiao_number, year))
        print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
        return None

    record = dict()
    record['id'] = gupiao_number
    record['name'] = my_gupiao_name[0]
    # Keep only quarters whose column actually has values.
    for suffix, column in (('12-31', col_q4), ('9-30', col_q3),
                           ('6-30', col_q2), ('3-31', col_q1)):
        if column:
            record['{}-{}'.format(year, suffix)] = dict(zip(indicators, column))
        else:
            print("\t{}号股票{}-{}数据为空\t".format(gupiao_number, year, suffix))

    # Original printed the success message twice (before and after the
    # empty-quarter checks); print it once.
    print("\t{}号股票{}年数据返回成功\t".format(gupiao_number, year))
    print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
    return record

if __name__ == '__main__':
    # Load the code list first, then crawl every stock's financials.
    get_gupiao_number()
    get_data()

3. 通过股票列表爬取股票近期的复权数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import random

import requests
from datetime import datetime
from lxml import etree
import pymysql
import pandas as pd


def get_gupiao_list(csv_path=r'C:\Users\15810\Desktop\python_code\Pctest\gupiao.csv'):
    """Read the stock-list CSV and return ({code: name}, [codes]).

    csv_path: path to the CSV file; column index 2 holds the stock codes and
    column index 3 the stock names. The default keeps the original hard-coded
    path, so existing callers are unaffected.
    """
    # Read both columns in one pass — the original parsed the same file twice.
    df = pd.read_csv(csv_path, encoding='utf-8', usecols=[2, 3])
    # usecols preserves file order, so columns[0] is the code column (index 2).
    df_number_list = df[df.columns[0]].tolist()
    df_name_list = df[df.columns[1]].tolist()
    gupiao_full_data = dict(zip(df_number_list, df_name_list))
    return gupiao_full_data, df_number_list


def get_data(gupiao_full_data, df_number_list, table_name):
    """Create the target table, then for every stock discover its available
    years/quarters from the history page and crawl each one via tar_data."""
    ua_list = [
        # One UA per element; the first spans two lines via implicit string
        # concatenation — the original accidentally split it into two broken
        # half-UA list entries.
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
    ]
    # random.choice returns one string; the original random.choices returned a
    # one-element list whose str() produced an invalid User-Agent header.
    ua = {'User-Agent': random.choice(ua_list)}
    creat_table(table_name)  # create the destination table first
    total = len(df_number_list)
    # enumerate replaces repeated df_number_list.index(i) scans per iteration.
    for pos, code in enumerate(gupiao_full_data.keys(), start=1):
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
        print("正在爬取第{}只股票".format(pos))
        print("进度{:.2%}".format(pos / total))
        url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/{}.phtml?year=2017&jidu=4'
        r = requests.get(url.format(code), headers=ua)
        r.encoding = r.apparent_encoding
        s = etree.HTML(r.text)
        # The page's <select> options tell us which years/quarters exist.
        year = s.xpath('//*[@id="con02-4"]/table[1]/tr/td/form/select[@name="year"]/option/text()')
        jidu = s.xpath('//*[@id="con02-4"]/table[1]/tr/td/form/select[@name="jidu"]/option/text()')
        jidu = list(range(1, len(jidu) + 1))  # quarter numbers 1..N
        for y in year:
            for q in jidu:
                print("\t正在爬取{}股票{}年第{}季度数据".format(code, y, q))
                tar_data(code, y, q, table_name)


def tar_data(gupiao_number, year, jidu, table_name):
    """Crawl one (stock, year, quarter) adjusted-price table and insert its rows.

    Skips the quarter when the number of date anchors does not match the number
    of data rows (malformed page).
    """
    ua_list = [
        # See get_data: the first UA spans two lines via implicit concatenation.
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
    ]
    # Fix: random.choice (one string), not random.choices (one-element list).
    ua = {'User-Agent': random.choice(ua_list)}
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/' \
          'stockid/{}.phtml?year={}&jidu={}'
    r = requests.get(url.format(gupiao_number, year, jidu), headers=ua)
    r.encoding = r.apparent_encoding
    s = etree.HTML(r.text)
    data = s.xpath('//*[@id="FundHoldSharesTable"]/tr[1]/following-sibling::*/td/div/text()')  # cell texts
    date = s.xpath('//*[@id="FundHoldSharesTable"]/tr[1]/following-sibling::*/td/div/a/text()')  # anchor dates
    date = [x.strip() for x in date]
    if len(date) == 0:
        # Layout A: no anchor dates — rows are 8 cells wide and the date is
        # already in cell 0 (just needs stripping).
        size = len(data) // 8
        full_data = [data[i * 8:(i + 1) * 8] for i in range(size)]
        for row in full_data:
            row[0] = row[0].strip()
            row.insert(0, str(gupiao_number))
            # NOTE(review): gupiao_full_data is a module-level global populated
            # in the __main__ block — confirm it is set before tar_data runs.
            row.append(gupiao_full_data[gupiao_number])
            insert_data(table_name, row)
    else:
        # Layout B: dates come from anchors — rows are 9 cells wide; the first
        # two cells are layout noise and are dropped.
        size = len(data) // 9
        full_data = [data[i * 9:(i + 1) * 9][2:] for i in range(size)]
        if len(date) != size:
            # Row/date count mismatch: malformed page, skip this quarter.
            print("\t{}号股票{}年数据第{}季度跳过成功".format(gupiao_number, year, jidu))
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
            return
        for idx, row in enumerate(full_data):
            row.insert(0, date[idx])
            row.insert(0, str(gupiao_number))
            row.append(gupiao_full_data[gupiao_number])
            insert_data(table_name, row)
    print("\t数据插入成功")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))


def creat_table(table_name):
    """Create the adjusted-quote table keyed on (股票号, 日期).

    The misspelled name ("creat") is kept because callers reference it.
    """
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='0000',
        db='fuquan',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
    try:
        with connection.cursor() as cursor:
            ddl = (r'CREATE TABLE `{}` ('
                   '`股票号` varchar(255) NOT NULL ,'
                   '`日期` date NOT NULL ,'
                   '`开盘价` float NULL ,'
                   '`最高价` float NULL ,'
                   '`收盘价` float NULL ,'
                   '`最低价` float NULL ,'
                   '`交易量(股)` float NULL ,'
                   '`交易金额(元)` float NULL ,'
                   '`复权因子` float NULL ,'
                   '`股票名` varchar(255) NULL ,PRIMARY KEY (`股票号`, `日期`));').format(table_name)
            cursor.execute(ddl)
            print("创建{}表成功".format(table_name))
    finally:
        connection.close()


def insert_data(table_name, values):
    """Insert one quote row (sequence of 10 values) into *table_name* and commit."""
    connection = pymysql.connect(
        host='localhost',
        user='root',
        password='0000',
        db='fuquan',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
    try:
        with connection.cursor() as cursor:
            stmt = ('INSERT INTO `{}` (`股票号`,`日期`, `开盘价`, `最高价`, `收盘价`,'
                    '`最低价`, `交易量(股)`, `交易金额(元)`, `复权因子`,`股票名`) '
                    'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)').format(table_name)
            cursor.execute(stmt, values)
            connection.commit()
    finally:
        connection.close()


if __name__ == '__main__':
    # Build the code->name map from the CSV, then crawl into one MySQL table.
    table_name = 'fuquan_table'
    gupiao_full_data, df_number_list = get_gupiao_list()
    get_data(gupiao_full_data, df_number_list, table_name)
-------------End Of This Article — Thank You For Reading-------------