Time Series and Stock data

Time Series and Stock data

本文是《Mastering pandas for Finance》一书第四、五章的学习笔记,介绍了 pandas 时间序列处理的相关操作。

准备工作

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Shared setup for every snippet in this article: libraries, the tushare
# client, plotting fonts and matplotlib defaults.
import pandas as pd
import numpy as np
# tushare is bound to the alias `ts`; later snippets in this file also assign
# a Series to the name `ts`, which replaces this module binding.
import tushare as ts
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Tushare Pro client; the string literal is a placeholder for a personal token.
pro = ts.pro_api('你的tushare密钥')
from matplotlib.font_manager import FontProperties
# Font used for Chinese text
cnfont = FontProperties(fname = '/Library/Fonts/Songti.ttc', size = 14)
# Font used for English text
enfont = FontProperties(fname = '/Users/czx/Library/Fonts/RobotoSlab-Regular.ttf', size = 14)
# Keep the minus sign '-' from being rendered as an empty box
rcParams['axes.unicode_minus'] = False
rcParams['savefig.dpi'] = 300 # DPI used when saving figures
rcParams['figure.dpi'] = 300 # DPI used when displaying figures
# NOTE: the second line rebinds the name `datetime` from the module to the
# class, so afterwards only the class form `datetime(...)` is available.
import datetime
from datetime import datetime

时间序列数据

时间序列数据和日期索引

Python
1
2
3
4
5
6
7
8
# Build a DatetimeIndex directly from plain datetime objects.
dates = [datetime(2014, 8, 1), datetime(2014, 8, 2)]
dti = pd.DatetimeIndex(dates)
dti
# DatetimeIndex(['2014-08-01', '2014-08-02'], dtype='datetime64[ns]', freq=None)

# A Series indexed by datetime objects gets a DatetimeIndex automatically.
# The Series is called `dated_series` rather than `ts` so that it does not
# overwrite the `ts` alias under which the tushare module is imported.
np.random.seed(123456)
dated_series = pd.Series(np.random.randn(2), index=dates)
type(dated_series.index)
# pandas.core.indexes.datetimes.DatetimeIndex

在 IPython 中清空所有变量(注意:此前导入的模块也会被清除,需要重新导入):
%reset

根据日期索引访问序列元素

Python
1
2
3
4
5
6
7
# A Series whose index labels are date *strings* (they are not parsed into
# timestamps here).  The Series is called `str_indexed` rather than `ts` so
# that it does not overwrite the `ts` alias of the tushare module.
np.random.seed(123456)
dates = ['2014-08-01', '2014-08-02']
str_indexed = pd.Series(np.random.randn(2), index=dates)
str_indexed
# 2014-08-01 0.469112
# 2014-08-02 -0.282863
# dtype: float64
Python
1
2
3
# Parse a list of differently formatted date strings into one DatetimeIndex;
# the None entry comes back as NaT (see the recorded output below).
# NOTE(review): recent pandas releases infer a single format from the first
# element, so a mixed-format list like this one may need format='mixed' --
# confirm against the installed pandas version.
dti = pd.to_datetime(['Aug 1, 2014', '20140802', '2014.08.03', None])
dti
# DatetimeIndex(['2014-08-01', '2014-08-02', '2014-08-03', 'NaT'], dtype='datetime64[ns]', freq=None)
Python
1
2
3
4
# '8/1/2018' is read month-first by default (August 1st).
dti1 = pd.to_datetime(['8/1/2018'])
# With dayfirst = True the leading field is the day, so '1/8/2018' is also
# August 1st; both results are equal, as the recorded output shows.
dti2 = pd.to_datetime(['1/8/2018'], dayfirst = True)
dti1[0], dti2[0]
# (Timestamp('2018-08-01 00:00:00'), Timestamp('2018-08-01 00:00:00'))

生成时间序列

Python
1
2
3
4
5
6
7
8
9
10
# Ten consecutive calendar days starting on 2014-08-01, each paired with a
# seeded standard-normal draw.
np.random.seed(123456)
dates = pd.date_range(start='8/1/2014', periods=10)
s1 = pd.Series(data=np.random.randn(10), index=dates)
s1.head(5)
# 2014-08-01 0.469112
# 2014-08-02 -0.282863
# 2014-08-03 -1.509059
# 2014-08-04 -1.135632
# 2014-08-05 1.212112
# Freq: D, dtype: float64
Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Daily bars for 000001.SZ over calendar year 2018, requested from tushare.
daily_query = {
    "ts_code": "000001.SZ",
    "start_date": "20180101",
    "end_date": "20181231",
}
pingan = pro.daily(**daily_query)
pingan.head()
# ts_code trade_date open ... pct_chg vol amount
# 0 000001.SZ 20181228 9.31 ... 1.0776 576604.00 541571.004
# 1 000001.SZ 20181227 9.45 ... -0.2151 624593.27 586343.755
# 2 000001.SZ 20181226 9.35 ... -0.4283 421140.60 393215.140
# 3 000001.SZ 20181225 9.29 ... -0.8493 586615.45 545235.607
# 4 000001.SZ 20181224 9.40 ... -0.3175 509117.67 477186.904
# [5 rows x 11 columns]
# Re-index the rows by the parsed trade date.
pingan.index = pd.to_datetime(pingan['trade_date'])

按照日期顺序排序

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# The download arrives newest-first; put the rows in chronological order and
# keep the closing price as its own Series.
pingan = pingan.sort_index()
pingana = pingan['close']
pingana[:3]
# trade_date
# 2018-01-02 13.70
# 2018-01-03 13.33
# 2018-01-04 13.25
# Name: close, dtype: float64

# Label-based row slice between two dates (both ends included).
pingan.loc['2018-12-27':'2018-12-28']
# ts_code trade_date ... vol amount
# trade_date ...
# 2018-12-27 000001.SZ 20181227 ... 624593.27 586343.755
# 2018-12-28 000001.SZ 20181228 ... 576604.00 541571.004
# [2 rows x 11 columns]

# A single date selects one row of the frame ...
pingan.loc['2018-01-02']
# ts_code 000001.SZ
# trade_date 20180102
# open 13.35
# high 13.93
# low 13.32
# close 13.7
# pre_close 13.3
# change 0.4
# pct_chg 3.01
# vol 2.08159e+06
# amount 2.85654e+06
# Name: 2018-01-02 00:00:00, dtype: object

# ... and a scalar from the close Series.
pingana.loc['2018-01-02']
# 13.7

选择月份

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# A year-month string selects every row that falls inside that month.
pingana.loc['2018-02'][:5]
# trade_date
# 2018-02-01 14.03
# 2018-02-02 14.05
# 2018-02-05 14.55
# 2018-02-06 14.00
# 2018-02-07 12.92
# Name: close, dtype: float64

# An explicit date range on the full frame, trimmed to its first five rows.
pingan.loc['2018-02-01':'2018-02-09'].head(5)
# ts_code trade_date ... vol amount
# trade_date ...
# 2018-02-01 000001.SZ 20180201 ... 2005614.75 2821583.609
# 2018-02-02 000001.SZ 20180202 ... 1176512.71 1637619.618
# 2018-02-05 000001.SZ 20180205 ... 2331997.90 3326355.314
# 2018-02-06 000001.SZ 20180206 ... 2582872.04 3641294.710
# 2018-02-07 000001.SZ 20180207 ... 3345716.97 4521586.590
# [5 rows x 11 columns]

创建指定频率的时间序列

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# One integer per minute from 2014-08-01 00:00 through 2014-10-29 23:59
# (90 days).  The index is built first and the values are sized from it, so
# the two can never disagree in length.  'min' is the minute alias; the older
# spelling 'T' is deprecated in recent pandas releases.
minute_index = pd.date_range('2014-08-01', '2014-10-29 23:59:00', freq='min')
bymin = pd.Series(np.arange(len(minute_index)), index=minute_index)
bymin['2014-08-01 12:30': '2014-08-01 12:39']
# 2014-08-01 12:30:00 750
# 2014-08-01 12:31:00 751
# 2014-08-01 12:32:00 752
# 2014-08-01 12:33:00 753
# 2014-08-01 12:34:00 754
# 2014-08-01 12:35:00 755
# 2014-08-01 12:36:00 756
# 2014-08-01 12:37:00 757
# 2014-08-01 12:38:00 758
# 2014-08-01 12:39:00 759
# Freq: T, dtype: int64   (recorded on an older pandas; newer ones print "min")

# A Period represents a whole span of time (here: one month), not an instant.
aug2014 = pd.Period('2014-08', freq="M")
aug2014
# Period('2014-08', 'M')
aug2014.start_time, aug2014.end_time
# (Timestamp('2014-08-01 00:00:00'), Timestamp('2014-08-31 23:59:59.999999999'))

# Adding an integer moves the period forward by that many units of its frequency.
sep2014 = aug2014 + 1
sep2014
# Period('2014-09', 'M')
sep2014.start_time, sep2014.end_time
# (Timestamp('2014-09-01 00:00:00'), Timestamp('2014-09-30 23:59:59.999999999'))

# A PeriodIndex holding the twelve months of 2013.
mp2013 = pd.period_range('1/1/2013', '12/31/2013', freq='M')
mp2013
# PeriodIndex(['2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06',
# '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'],
# dtype='period[M]', freq='M')

# Print each month together with its frequency object and exact boundaries.
for p in mp2013:
    print("{0} {1} {2} {3}".format(p, p.freq, p.start_time, p.end_time))
# 2013-01 <MonthEnd> 2013-01-01 00:00:00 2013-01-31 23:59:59.999999999
# 2013-02 <MonthEnd> 2013-02-01 00:00:00 2013-02-28 23:59:59.999999999
# 2013-03 <MonthEnd> 2013-03-01 00:00:00 2013-03-31 23:59:59.999999999
# 2013-04 <MonthEnd> 2013-04-01 00:00:00 2013-04-30 23:59:59.999999999
# 2013-05 <MonthEnd> 2013-05-01 00:00:00 2013-05-31 23:59:59.999999999
# 2013-06 <MonthEnd> 2013-06-01 00:00:00 2013-06-30 23:59:59.999999999
# 2013-07 <MonthEnd> 2013-07-01 00:00:00 2013-07-31 23:59:59.999999999
# 2013-08 <MonthEnd> 2013-08-01 00:00:00 2013-08-31 23:59:59.999999999
# 2013-09 <MonthEnd> 2013-09-01 00:00:00 2013-09-30 23:59:59.999999999
# 2013-10 <MonthEnd> 2013-10-01 00:00:00 2013-10-31 23:59:59.999999999
# 2013-11 <MonthEnd> 2013-11-01 00:00:00 2013-11-30 23:59:59.999999999
# 2013-12 <MonthEnd> 2013-12-01 00:00:00 2013-12-31 23:59:59.999999999

# Twelve seeded random values, one per month of the 2013 PeriodIndex.
np.random.seed(123456)
monthly_noise = np.random.randn(12)
ps = pd.Series(data=monthly_noise, index=mp2013)
ps
# 2013-01 0.469112
# 2013-02 -0.282863
# 2013-03 -1.509059
# 2013-04 -1.135632
# 2013-05 1.212112
# 2013-06 -0.173215
# 2013-07 0.119209
# 2013-08 -1.044236
# 2013-09 -0.861849
# 2013-10 -2.104569
# 2013-11 -0.494929
# 2013-12 1.071804
# Freq: M, dtype: float64

时间序列的移动和滞后

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Reference: the first five closing prices before any shifting.
pingana[:5]
# trade_date
# 2018-01-02 13.70
# 2018-01-03 13.33
# 2018-01-04 13.25
# 2018-01-05 13.30
# 2018-01-08 12.96
# Name: close, dtype: float64

# Shift the values one row later: each date now carries the previous row's
# close, and the first row becomes NaN. The shift is by row, not calendar day.
shifted_forward = pingana.shift(1)
shifted_forward[:5]
# trade_date
# 2018-01-02 NaN
# 2018-01-03 13.70
# 2018-01-04 13.33
# 2018-01-05 13.25
# 2018-01-08 13.30
# Name: close, dtype: float64

# Shift the values TWO rows earlier (the argument is -2, not -1): each date
# now carries the close recorded two rows after it.
shifted_backwards = pingana.shift(-2)[:10]
shifted_backwards[:5]
# trade_date
# 2018-01-02 13.25
# 2018-01-03 13.30
# 2018-01-04 12.96
# 2018-01-05 13.08
# 2018-01-08 13.47
# Name: close, dtype: float64

# With freq given, the index labels move (here by three seconds) and the
# values stay attached to their original rows.
pingana.shift(3, freq = "S")[:5]
# trade_date
# 2018-01-02 00:00:03 13.70
# 2018-01-03 00:00:03 13.33
# 2018-01-04 00:00:03 13.25
# 2018-01-05 00:00:03 13.30
# 2018-01-08 00:00:03 12.96
# Name: close, dtype: float64

# Same idea, moving every index label three calendar days later.
pingana.shift(3, freq = "D")[:5]
# trade_date
# 2018-01-05 13.70
# 2018-01-06 13.33
# 2018-01-07 13.25
# 2018-01-08 13.30
# 2018-01-11 12.96
# Name: close, dtype: float64

# Simple return: today's close divided by the previous row's close, minus 1.
(pingana/pingana.shift(1) - 1)[:5]
# trade_date
# 2018-01-02 NaN
# 2018-01-03 -0.027007
# 2018-01-04 -0.006002
# 2018-01-05 0.003774
# 2018-01-08 -0.025564
# Name: close, dtype: float64

时间序列的频率转换

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# Work on just the first two daily observations.
sample = pingana[:2]
sample
# trade_date
# 2018-01-02 13.70
# 2018-01-03 13.33
# Name: close, dtype: float64

# Convert to hourly frequency: the new hourly rows between the two original
# observations have no data and show up as NaN.
sample.asfreq("H")
# trade_date
# 2018-01-02 00:00:00 13.70
# 2018-01-02 01:00:00 NaN
# 2018-01-02 02:00:00 NaN
# ······
# 2018-01-02 21:00:00 NaN
# 2018-01-02 22:00:00 NaN
# 2018-01-02 23:00:00 NaN
# 2018-01-03 00:00:00 13.33
# Freq: H, Name: close, dtype: float64

# Hourly conversion with forward fill: gaps take the last earlier value.
sample.asfreq('H', method = 'ffill')
# trade_date
# 2018-01-02 00:00:00 13.70
# 2018-01-02 01:00:00 13.70
# ······
# 2018-01-02 22:00:00 13.70
# 2018-01-02 23:00:00 13.70
# 2018-01-03 00:00:00 13.33
# Freq: H, Name: close, dtype: float64

# Hourly conversion with backward fill: gaps take the next later value.
sample.asfreq('H', method = 'bfill')
# trade_date
# 2018-01-02 00:00:00 13.70
# 2018-01-02 01:00:00 13.33
# 2018-01-02 02:00:00 13.33
# ······
# 2018-01-02 23:00:00 13.33
# 2018-01-03 00:00:00 13.33
# Freq: H, Name: close, dtype: float64

时间序列数据的再抽样

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Cumulative return: running product of (1 + simple daily return).
simple_daily_ret = pingana / pingana.shift() - 1
pingan_cum_ret = (1 + simple_daily_ret).cumprod()
pingan_cum_ret
#> trade_date
#> 2018-01-02 NaN
#> 2018-01-03 0.972993
#> 2018-01-04 0.967153
#> ...
#> 2018-12-27 0.677372
#> 2018-12-28 0.684672
#> Name: close, Length: 243, dtype: float64

# Downsample to calendar months, averaging the daily values in each month.
monthly_bins = pingan_cum_ret.resample("M")
pingan_monthly_cum_ret = monthly_bins.mean()
pingan_monthly_cum_ret
#> trade_date
#> 2018-01-31 1.014633
#> 2018-02-28 0.931727
#> 2018-03-31 0.851958
#> 2018-04-30 0.825264
#> 2018-05-31 0.786297
#> 2018-06-30 0.719161
#> 2018-07-31 0.654280
#> 2018-08-31 0.683148
#> 2018-09-30 0.745601
#> 2018-10-31 0.775912
#> 2018-11-30 0.772794
#> 2018-12-31 0.728431
#> Freq: M, Name: close, dtype: float64

# Cross-check: averaging January by hand gives the same figure as above.
pingan_cum_ret.loc['2018-01'].mean()
#> 1.0146332985749043

# First / highest / lowest / last value inside each month.
pingan_cum_ret.resample("M").ohlc().head(5)
#> open high low close
#> trade_date
#> 2018-01-31 0.972993 1.080292 0.945985 1.025547
#> 2018-02-28 1.024088 1.062044 0.853285 0.879562
#> 2018-03-31 0.878832 0.883942 0.794891 0.795620
#> 2018-04-30 0.781752 0.865693 0.770803 0.791971
#> 2018-05-31 0.794161 0.816058 0.735766 0.743066

# Monthly means again, but labelled with Period objects instead of
# month-end timestamps, so each label knows its own start and end.
by_period = pingan_cum_ret.resample("M", kind="period").mean()
# Show the exact time span each of the first five periods covers, together
# with that period's mean.
for i in by_period.index[:5]:
    print("{0}:{1} {2}".format(i.start_time, i.end_time, by_period[i]))
#> 2018-01-01 00:00:00:2018-01-31 23:59:59.999999999 1.0146332985749043
#> 2018-02-01 00:00:00:2018-02-28 23:59:59.999999999 0.9317274939172748
#> 2018-03-01 00:00:00:2018-03-31 23:59:59.999999999 0.8519575315195755
#> 2018-04-01 00:00:00:2018-04-30 23:59:59.999999999 0.8252635847526362
#> 2018-05-01 00:00:00:2018-05-31 23:59:59.999999999 0.7862972793629731

# Take the second and third observations (positions 1 and 2).
sample = pingan_cum_ret.iloc[1:3]
sample
#> trade_date
#> 2018-01-03 0.972993
#> 2018-01-04 0.967153
#> Name: close, dtype: float64

# Upsample to hourly bins; hours without an observation come out as NaN.
hourly_bins = sample.resample('H')
by_hour = hourly_bins.mean()
by_hour
#> trade_date
#> 2018-01-03 00:00:00 0.972993
#> 2018-01-03 01:00:00 NaN
#> 2018-01-03 02:00:00 NaN
#> ······
#> 2018-01-03 22:00:00 NaN
#> 2018-01-03 23:00:00 NaN
#> 2018-01-04 00:00:00 0.967153
#> Freq: H, Name: close, dtype: float64

# Fill the gaps by interpolating between the two known points.
by_hour.interpolate()
#> trade_date
#> 2018-01-03 00:00:00 0.972993
#> 2018-01-03 01:00:00 0.972749
#> 2018-01-03 02:00:00 0.972506
#> ······
#> 2018-01-03 21:00:00 0.967883
#> 2018-01-03 22:00:00 0.967640
#> 2018-01-03 23:00:00 0.967397
#> 2018-01-04 00:00:00 0.967153
#> Freq: H, Name: close, dtype: float64

股票的时间序列数据

获取股票数据

为了更高效地获取多只股票的数据,我们先写一个函数:

Python
1
2
3
4
5
6
7
def get(tickers, start, end):
    """Download daily bars for several tickers and stack them into one frame.

    Args:
        tickers: Iterable of tushare ``ts_code`` strings such as
            ``'000001.SZ'``.
        start: First trade date, forwarded to ``pro.daily`` as ``start_date``.
        end: Last trade date, forwarded to ``pro.daily`` as ``end_date``.

    Returns:
        The per-ticker frames concatenated under a two-level row index whose
        levels are named ``'ticker'`` and ``'date'``.
    """
    def data(ticker):
        # One request per ticker; rows are re-indexed by the parsed trade date.
        temp_data = pro.daily(ts_code=ticker, start_date=start, end_date=end)
        temp_data.index = pd.to_datetime(temp_data.trade_date)
        return temp_data

    # Materialise the tickers once: they are needed twice (for the downloads
    # and as concat keys), which would exhaust a one-shot iterable.
    tickers = list(tickers)
    datas = [data(ticker) for ticker in tickers]
    return pd.concat(datas, keys=tickers, names=['ticker', 'date'])
Python
1
2
3
4
5
6
7
8
9
10
11
12
# Six Shenzhen-listed tickers, downloaded for calendar year 2018.
tickers = [
    '000001.SZ',
    '000002.SZ',
    '000004.SZ',
    '000005.SZ',
    '000006.SZ',
    '000007.SZ',
]
all_data = get(tickers, start='20180101', end='20181231')
all_data.head(5)
#> ts_code trade_date ... vol amount
#> ticker date ...
#> 000001.SZ 2018-12-28 000001.SZ 20181228 ... 576604.00 541571.004
#> 2018-12-27 000001.SZ 20181227 ... 624593.27 586343.755
#> 2018-12-26 000001.SZ 20181226 ... 421140.60 393215.140
#> 2018-12-25 000001.SZ 20181225 ... 586615.45 545235.607
#> 2018-12-24 000001.SZ 20181224 ... 509117.67 477186.904
#> [5 rows x 11 columns]

金融时间序列的可视化

收盘价的走势:

Python
1
2
3
4
5
6
7
8
9
10
11
12
import matplotlib.pyplot as plt
# Flatten the (ticker, date) row index into ordinary columns ...
just_closing_prices = all_data[['close']].reset_index()
just_closing_prices[:5]
# ... then reshape to one row per date and one column per ticker.  The
# arguments are passed by keyword because pandas 2.x no longer accepts them
# positionally.
daily_close_px = just_closing_prices.pivot(index='date', columns='ticker', values='close')
daily_close_px[:5]
# Closing-price line for 000001.SZ on a light grid.
plt.grid(linewidth=0.2)
plt.plot(daily_close_px['000001.SZ'])
plt.title('平安银行收盘价走势图', fontproperties=cnfont, fontsize=20)
plt.xlabel('date', fontproperties=enfont)
plt.ylabel('Closing prices', fontproperties=enfont)
plt.savefig('pinganbank.svg')
plt.show()

Python
1
2
3
4
5
6
7
8
# One line per ticker column of the pivoted closing-price table.
plt.figure(figsize=(8, 6))
plt.grid(linewidth=0.2)
plt.plot(daily_close_px)
plt.title("深交所部分上市公司2018年股价走势", fontproperties=cnfont, fontsize=20)
# Take the legend labels from the plotted columns themselves, so the labels
# cannot get out of step with the column order or with a missing ticker.
plt.legend(list(daily_close_px.columns), loc='best')
plt.savefig("szm.svg")
plt.show()

绘制交易量:

Python
1
2
3
4
5
# Daily traded volume of 000001.SZ as a bar chart.
volume_by_ticker = all_data['vol']
pinganv = volume_by_ticker.loc['000001.SZ']
plt.bar(pinganv.index, pinganv)
plt.title('平安银行2018年的交易量', fontproperties=cnfont, fontsize=18)
plt.savefig("pinganvol.svg")
plt.show()

合并价格和交易量:

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
# Upper panel (three of the four grid rows): closing price.
grid_shape = (4, 4)
top = plt.subplot2grid(grid_shape, (0, 0), rowspan=3, colspan=4)
pingan_close_px = daily_close_px['000001.SZ']
top.plot(daily_close_px.index, pingan_close_px, label='Closing Price')
plt.title('平安银行 2018 年日收盘价', fontproperties=cnfont, fontsize=16)
plt.legend(loc=2)

# Lower panel (the remaining grid row): traded volume.
bottom = plt.subplot2grid(grid_shape, (3, 0), rowspan=1, colspan=4)
bottom.bar(pinganv.index, pinganv)
plt.title('平安银行 2018 年日交易量', fontproperties=cnfont, fontsize=16)

# Resize the figure and leave vertical room between the two panels.
current_fig = plt.gcf()
current_fig.set_size_inches(12, 8)
plt.subplots_adjust(hspace=0.75)
plt.savefig("平安银行收盘价和交易量走势图.svg")
plt.show()

绘制蜡烛图

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# December 2018 rows for 000001.SZ, in chronological order with a fresh
# 0..n-1 index (drop=True avoids creating and then deleting an 'index' column).
subset = (
    all_data.loc['000001.SZ']
    .loc['2018-12':'2018-12']
    .reset_index()
    .sort_values('date')
    .reset_index(drop=True)
)
subset[:5]

# Add a numeric date column: candlestick_ohlc expects matplotlib date numbers.
import matplotlib.dates as mdates
subset['date_num'] = subset['date'] \
    .apply(lambda date: mdates.date2num(date.to_pydatetime()))
subset[:5]

# One (date_num, open, high, low, close) tuple per trading day.
ohlc_columns = ['date_num', 'open', 'high', 'low', 'close']
subset_as_tuple = [tuple(x) for x in subset[ohlc_columns].values]
subset_as_tuple

from matplotlib.dates import DateFormatter
from matplotlib.dates import (WeekdayLocator, MONDAY)
# NOTE(review): mpl_finance appears to be unmaintained; the mplfinance package
# is believed to ship the same function -- confirm before switching.
from mpl_finance import candlestick_ohlc
week_formatter = DateFormatter('%Y-%m-%d')
mondays = WeekdayLocator(MONDAY) # major ticks on the mondays
# Create the figure and its axes in a single call so the requested size
# applies to the figure that is actually drawn.  Calling plt.figure() first
# and plt.subplots() afterwards leaves an empty extra figure behind.
fig, ax = plt.subplots(figsize=(12, 8))
ax.xaxis.set_major_locator(mondays)
ax.xaxis.set_major_formatter(week_formatter)
candlestick_ohlc(ax, subset_as_tuple, width=0.6, colorup='r', colordown='g')
plt.savefig('candlestick.svg')
plt.show()

一些金融指标的计算

简单日收益率

首先算一天的:

Python
1
2
3
4
5
# Simple return between the first two trading days of 000001.SZ.
pingan_close_col = daily_close_px['000001.SZ']
pa_p_t0 = pingan_close_col.iloc[0]  # close on the first day
pa_p_t1 = pingan_close_col.iloc[1]  # close on the second day
r_t1 = pa_p_t1 / pa_p_t0 - 1  # simple daily return
pa_p_t0, pa_p_t1, r_t1
#> (13.7, 13.33, -0.02700729927007295)

然后是算所有的,具体来说,至少有下面三种方法:

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Method 1: divide rows 1..n by the raw values of rows 0..n-1.  Using
# .values strips the index so the division pairs rows by position.
previous_closes = daily_close_px.iloc[:-1].values
dpc_1 = daily_close_px.iloc[1:] / previous_closes - 1
dpc_1.loc[:, '000001.SZ':'000004.SZ'].head(5)

# Method 2: divide the table by a copy of itself shifted down one row.
price_matrix_minus_day1 = daily_close_px.iloc[1:]
price_matrix_minus_day1.head(5)
daily_close_px.iloc[:-1].values
lagged_close_px = daily_close_px.shift(1)
dpc_2 = daily_close_px / lagged_close_px - 1
dpc_2.iloc[:, 0:2].head(5)

# Method 3: the built-in percentage-change helper.
daily_pct_change = daily_close_px.pct_change()
daily_pct_change.iloc[:, 0:2].head(5)

计算简单日累积收益率

Python
1
2
3
4
5
6
# Cumulative return per ticker: running product of (1 + daily return).
growth_factors = 1 + daily_pct_change
cum_daily_return = growth_factors.cumprod()
cum_daily_return.iloc[:, :2].head(5)
cum_daily_return.plot(figsize=(12, 8))
plt.legend(loc='best')
plt.savefig('cumprod.svg')
plt.show()

分析收益率的分布

直方图

Python
1
2
3
4
# Daily simple returns of 000001.SZ.  This rebinds `pingan`, which earlier in
# this file held the raw price DataFrame.
pingan = daily_pct_change['000001.SZ']
# Histogram of the returns using 50 bins.
pingan.hist(bins = 50, figsize = (6, 4))
plt.savefig('pinganhist.svg')
plt.show()

描述性统计

Python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Descriptive statistics
# NOTE(review): at this point `pingan` is the single return Series taken from
# daily_pct_change, yet the recorded output below is a multi-column table
# (open, high, ..., vol, amount).  It looks like output captured from the
# price DataFrame -- re-run to refresh.
pingan.describe()
#> open high ... vol amount
#> count 243.000000 243.000000 ... 2.430000e+02 2.430000e+02
#> mean 10.794774 10.946584 ... 1.138418e+06 1.280897e+06
#> std 1.438035 1.482004 ... 5.980727e+05 8.337155e+05
#> min 8.600000 8.700000 ... 2.990896e+05 3.053892e+05
#> 25% 9.960000 10.090000 ... 6.901344e+05 7.064880e+05
#> 50% 10.570000 10.670000 ... 9.949177e+05 1.042529e+06
#> 75% 11.480000 11.610000 ... 1.367096e+06 1.547458e+06
#> max 14.800000 15.130000 ... 3.345717e+06 4.521587e+06
#> [8 rows x 9 columns]
# Descriptive statistics with custom percentiles (2.5%, 50%, 97.5%)
pingan.describe(percentiles = [0.025, 0.5, 0.975])
#> open high ... vol amount
#> count 243.000000 243.000000 ... 2.430000e+02 2.430000e+02
#> mean 10.794774 10.946584 ... 1.138418e+06 1.280897e+06
#> std 1.438035 1.482004 ... 5.980727e+05 8.337155e+05
#> min 8.600000 8.700000 ... 2.990896e+05 3.053892e+05
#> 2.5% 8.730000 8.831000 ... 4.317970e+05 3.937381e+05
#> 50% 10.570000 10.670000 ... 9.949177e+05 1.042529e+06
#> 97.5% 14.325000 14.565000 ... 2.655756e+06 3.633853e+06
#> max 14.800000 15.130000 ... 3.345717e+06 4.521587e+06
#> [8 rows x 9 columns]
Python
1
2
3
daily_pct_change.hist(bins = 50, sharex = True, figsize = (12, 8))
plt.savefig("szmhist.svg")
plt.show()

QQ 图

Python
1
2
3
4
5
6
import scipy.stats as stats
# Normal Q-Q plot of the daily returns.
f = plt.figure(figsize=(6, 4))
ax = f.add_subplot(111)
# pct_change() leaves NaN in the first row; drop it so the missing value does
# not enter the quantile ordering or the fitted reference line.
stats.probplot(pingan.dropna(), dist='norm', plot=ax)
plt.savefig('pinganqqplot.svg')
plt.show()

箱线图

Python
1
2
3
# One box per ticker, summarising the spread of its daily returns.
daily_pct_change.plot.box()
plt.savefig('szmbox.svg')
plt.show()

比较两只股票的日收益率

为了方便,我们先编写一个绘图函数:

Python
1
2
3
4
5
6
7
8
9
10
11
def render_scatter_plot(data, x_stock_name, y_stock_name, xlim = None, ylim = None):
    """Scatter one column of ``data`` against another, with reference lines.

    Draws the scatter on a new 12x8 inch figure, then overlays a vertical
    line at x = 0, a horizontal line at y = 0 and the diagonal y = x, each
    spanning -10..10.

    Args:
        data: Table indexable by column name (e.g. a DataFrame of returns).
        x_stock_name: Column plotted on the x axis; also the x-axis label.
        y_stock_name: Column plotted on the y axis; also the y-axis label.
        xlim: Optional x-axis limits, applied when not None.
        ylim: Optional y-axis limits, applied when not None.  Previously
            this argument was accepted but ignored.
    """
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    ax.scatter(data[x_stock_name], data[y_stock_name])
    if xlim is not None:
        ax.set_xlim(xlim)
    if ylim is not None:
        ax.set_ylim(ylim)
    # Freeze the view first so the long reference lines do not rescale it.
    ax.autoscale(False)
    ax.vlines(0, -10, 10)
    ax.hlines(0, -10, 10)
    ax.plot((-10, 10), (-10, 10))
    ax.set_xlabel(x_stock_name)
    ax.set_ylabel(y_stock_name)
Python
1
2
3
4
# Compare the daily returns of 000001.SZ and 000002.SZ, with the x axis
# restricted to +/-15%.
limits = [-0.15, 0.15]
render_scatter_plot(
    daily_pct_change,
    '000001.SZ',
    '000002.SZ',
    xlim=limits,
)
plt.savefig('pinganandwankescatter.svg')
plt.show()

Python
1
2
3
render_scatter_plot(daily_pct_change, '000004.SZ', '000006.SZ')
plt.savefig('4and6scatter.svg')
plt.show()

所有的股票:

Python
1
2
3
pd.plotting.scatter_matrix(daily_pct_change, diagonal = 'kde', alpha = 0.1, figsize = (12, 8))
plt.savefig('plotmatrix.svg')
plt.show()

滚动窗口

计算 MA

Python
1
2
3
4
5
6
7
8
9
# 2018 closing prices of 000001.SZ in chronological order.
pingan = all_data.loc['000001.SZ'].loc['2018']['close'].sort_index()
pingan[:5]
# Plot the price, then overlay its 5-, 10- and 20-row moving averages.
pingan.plot()
for window in (5, 10, 20):
    pingan.rolling(window).mean().plot()
plt.savefig('pinganwithma.svg')
plt.show()

均值回复

Python
1
2
3
4
def mean_abs_dev(x):
    """Return the mean absolute deviation of ``x`` from its own mean."""
    centred = x - x.mean()
    return np.fabs(centred).mean()
pingan.rolling(5).apply(mean_abs_dev).plot()
plt.savefig('pingandemean.svg')
plt.show()

拓展窗口的计算

拓展窗口是指,下一轮计算比本轮计算多了一个观测值(窗口左边不动,右边向右移动一个观测值)。

Python
1
2
3
4
5
def expanding_mean(x):
    """Return the running mean of ``x`` over an expanding window.

    Element i of the result is the mean of x[0..i].  The built-in expanding
    window replaces ``rolling(len(x), min_periods=1)``, which raises for an
    empty input because the window length 0 is smaller than ``min_periods``.
    """
    return x.expanding(min_periods=1).mean()
pingan.plot()
expanding_mean(pingan).plot()
plt.savefig('expanding.svg')
plt.show()

波动率的计算

Python
1
2
3
4
5
6
7
# Volatility proxy: standard deviation over a 22-row rolling window, scaled
# by the square root of the window length.
min_periods = 22
rolling_std = daily_pct_change.rolling(min_periods).std()
vol = rolling_std * np.sqrt(min_periods)
vol.plot()
plt.title("深交所部分股票的收益率标准差", fontproperties=cnfont)
plt.savefig("szmvol.svg")
plt.show()

Python
1
2
3
4
5
6
7
# Alternative measure: realized volatility.
def rv(x):
    """Return the sum of the squared elements of ``x``."""
    return np.sum(x ** 2)
vol2 = daily_pct_change.rolling(min_periods).apply(rv)
vol2.plot()
plt.title("深交所部分股票的已实现波动率", fontproperties = cnfont)
plt.savefig("szmvol2.svg")
plt.show()

收益率的滚动窗口相关系数

Python
1
2
3
4
# 22-row rolling correlation between the daily returns of 000001.SZ and
# 000002.SZ; rows without a defined value are dropped.
returns_000001 = daily_pct_change['000001.SZ']
returns_000002 = daily_pct_change['000002.SZ']
rolling_corr = returns_000001.rolling(22).corr(returns_000002).dropna()
rolling_corr.plot()
plt.savefig('1and2corr.svg')
plt.show()

和深证 100R 指数比较

Python
1
2
3
4
5
6
7
8
9
# Daily bars of index 399004.SZ for 2018, indexed by date, oldest first.
sz100 = pro.index_daily(
    ts_code='399004.SZ',
    start_date='20180101',
    end_date='20181231',
)
sz100.index = pd.to_datetime(sz100['trade_date'])
sz100 = sz100.sort_index()
# Index daily returns; the undefined first row is set to 0.
sz100_dpc = sz100['close'].pct_change().fillna(0)
sz100_dpc[:5]
# Put the index returns next to the stock returns, name the index column
# after its code, and zero-fill whatever is still missing.
dpc_all = pd.concat([sz100_dpc, daily_pct_change], axis=1)
dpc_all = dpc_all.rename(columns={'close': '399004.SZ'})
dpc_all = dpc_all.fillna(0)
dpc_all[:5]

累积收益率

Python
1
2
cdr_all = (1 + dpc_all).cumprod()
cdr_all[:5]

相关系数矩阵

Python
1
2
3
4
# Pairwise correlation matrix of all daily-return columns.
dpc_corr = dpc_all.corr()
dpc_corr
# What we care about is each stock's correlation with the index column 399004.SZ.
dpc_corr.loc['399004.SZ']

累积收益率的走势

Python
1
2
3
4
cdr_all.plot()
plt.legend(loc = 'best')
plt.savefig('cumprodszm.svg')
plt.show()

Python
1
2
3
render_scatter_plot(dpc_all, '000001.SZ', '399004.SZ')
plt.savefig('pinganandsz100.svg')
plt.show()

# Python

Comments

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×