本数据来源于2019新型冠状病毒疫情时间序列数据仓库,其数据来源为丁香园。
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import data_util
import plot_util
from IPython.display import display, Markdown
# Load the raw records through the project's data_util helper. Later cells
# index this object by column name and hand it to the aggregation helpers
# (presumably a pandas DataFrame -- confirm in data_util).
raw_data = data_util.load_data()
全国数据数值跨度较大、自然增长、服从一定的概率分布,且理应不受人为干预,所以它应该满足本福特定律(Benford's Law, First-Digit Law),即数据的首位数字越小,它出现的概率越高。比如首位数字是1的概率(约30.1%)比首位数字是9的概率(约4.6%)高很多。
本福特定律只是数据真实的必要而非充分条件:如果只公布部分数据,且已公布的数据恰好符合上述条件,就可以规避本数据校验。
# Nationwide Benford's-law check: hand every city-level confirmed count to
# data_util.benford and chart the result with plot_util.plot_bar.
# 'Digit' / 'Percent' are presumably the axis labels -- confirm in plot_util.
raw_city_confirmed = raw_data['city_confirmedCount']
benford_raw = data_util.benford(raw_city_confirmed)
figure = plot_util.plot_bar(
    benford_raw,
    '全国数据校验(本福特定律)',
    'Digit',
    'Percent',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_5_0.png)
# City analysed in this section. The visible cells never assign city_name, so
# running them top to bottom would stop with a NameError on its first use; the
# recorded table for this section lists 深圳 (Shenzhen) rows, which fixes the
# value used for that run.
city_name = '深圳'
display(Markdown(f'## {city_name}数据'))
# Reload the raw records before building this section's daily series.
raw_data = data_util.load_data()
display(Markdown(f'### {city_name}累计数量'))
# Per-day series for the chosen city; calculate_dead_cured_rate presumably adds
# the dead_rate / cured_rate columns seen in the recorded table.
city_daily_data = data_util.aggregate_daily(raw_data, city_name)
city_daily_data = data_util.calculate_dead_cured_rate(city_daily_data)
# Keep this expression last so the notebook renders the five latest rows.
city_daily_data.tail(5)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5580 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
416 |
0 |
199 |
2 |
2020-02-20 19:07:19.834 |
2020-02-20 |
0.480769 |
47.836538 |
4495 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
416 |
0 |
222 |
2 |
2020-02-21 18:12:13.066 |
2020-02-21 |
0.480769 |
53.365385 |
2753 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
417 |
0 |
226 |
2 |
2020-02-22 18:01:40.406 |
2020-02-22 |
0.479616 |
54.196643 |
1065 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
417 |
0 |
237 |
3 |
2020-02-23 19:19:02.938 |
2020-02-23 |
0.719424 |
56.834532 |
32 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
417 |
0 |
249 |
3 |
2020-02-24 18:16:54.754 |
2020-02-24 |
0.719424 |
59.712230 |
# Chart the cumulative daily series of the selected city.
figure = plot_util.plot_conf_main(city_daily_data, f'{city_name}累计')
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_11_0.png)
display(Markdown(f'### {city_name}增长速度'))
# Growth speed: data_util.diff applied to the cumulative series. The recorded
# numeric columns are day-over-day changes of the cumulative values.
city_daily_data_1st_derivative = data_util.diff(city_daily_data)
# Last expression of the cell, so the notebook renders the trailing rows.
city_daily_data_1st_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5580 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
17.0 |
0.0 |
2020-02-20 19:07:19.834 |
2020-02-20 |
0.000000 |
4.086538 |
4495 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
23.0 |
0.0 |
2020-02-21 18:12:13.066 |
2020-02-21 |
0.000000 |
5.528846 |
2753 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
1.0 |
0.0 |
4.0 |
0.0 |
2020-02-22 18:01:40.406 |
2020-02-22 |
-0.001153 |
0.831258 |
1065 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
11.0 |
1.0 |
2020-02-23 19:19:02.938 |
2020-02-23 |
0.239808 |
2.637890 |
32 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
12.0 |
0.0 |
2020-02-24 18:16:54.754 |
2020-02-24 |
0.000000 |
2.877698 |
# Chart the growth-speed (first difference) series of the selected city.
figure = plot_util.plot_conf_main(
    city_daily_data_1st_derivative,
    f'{city_name}增长速度',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_14_0.png)
display(Markdown(f'### {city_name}增长加速度'))
# Growth acceleration: data_util.diff applied a second time, i.e. to the
# growth-speed series rather than to the cumulative one.
city_daily_data_2nd_derivative = data_util.diff(city_daily_data_1st_derivative)
# Last expression of the cell, so the notebook renders the trailing rows.
city_daily_data_2nd_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5580 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
-2.0 |
0.0 |
2020-02-20 19:07:19.834 |
2020-02-20 |
0.000000 |
-0.480769 |
4495 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
6.0 |
0.0 |
2020-02-21 18:12:13.066 |
2020-02-21 |
0.000000 |
1.442308 |
2753 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
1.0 |
0.0 |
-19.0 |
0.0 |
2020-02-22 18:01:40.406 |
2020-02-22 |
-0.001153 |
-4.697588 |
1065 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
-1.0 |
0.0 |
7.0 |
1.0 |
2020-02-23 19:19:02.938 |
2020-02-23 |
0.240961 |
1.806632 |
32 |
广东省 |
Guangdong |
440000 |
深圳 |
Shenzhen |
440300.0 |
0.0 |
0.0 |
1.0 |
-1.0 |
2020-02-24 18:16:54.754 |
2020-02-24 |
-0.239808 |
0.239808 |
# The acceleration series contains negative values. By default Matplotlib draws
# negative tick labels with the Unicode minus sign (U+2212, code point 8722),
# which the font in use does not provide -- that is the
# "Glyph 8722 missing from current font" RuntimeWarning recorded for this cell.
# Switching to the ASCII hyphen-minus avoids the missing glyph.
plt.rcParams['axes.unicode_minus'] = False
figure = plot_util.plot_conf_main(city_daily_data_2nd_derivative, city_name + '增长加速度')
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py:211: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0.0, flags=flags)
/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/matplotlib/backends/backend_agg.py:180: RuntimeWarning: Glyph 8722 missing from current font.
font.set_text(s, 0, flags=flags)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_17_1.png)
display(Markdown(f'### {city_name}死亡治愈率'))
# Death / cure rate chart, drawn from the cumulative series (the one that
# carries the dead_rate and cured_rate columns).
figure = plot_util.plot_conf_dead_cured_ratio(
    city_daily_data,
    f'{city_name}死亡治愈率%',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_19_0.png)
# Province that the "rest of the country" cells below leave out.
black_province_name = '湖北省'
display(Markdown(f'## 全国数据(除{black_province_name})'))
# Explanatory paragraph, rendered as Markdown; the text is assembled from
# adjacent literals so each sentence stays on its own source line.
display(Markdown(
    f'因{black_province_name}灾情特别严重且现已隔离({black_province_name}加油),'
    '它的数据可能和全国其他地区有较大差别。'
    '为更精确预计其他地区的未来发展趋势,这里考虑排除其以外的全国其他地区情况。'
))
因湖北省灾情特别严重且现已隔离(湖北省加油),它的数据可能和全国其他地区有较大差别。为更精确预计其他地区的未来发展趋势,这里考虑排除其以外的全国其他地区情况。
display(Markdown(f'## 全国累计(除{black_province_name})'))
# Daily series for everything except the excluded province, followed by the
# same rate calculation that the per-city sections use.
white_daily_data = data_util.aggregate_daily_except(
    raw_data,
    province_name=black_province_name,
)
white_daily_data = data_util.calculate_dead_cured_rate(white_daily_data)
figure = plot_util.plot_conf_main(
    white_daily_data,
    f'全国累计(除{black_province_name})',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_25_0.png)
display(Markdown(f'## 全国增长速度(除{black_province_name})'))
# Growth speed of the rest-of-country series: data_util.diff of the
# cumulative data.
white_daily_data_1st_derivative = data_util.diff(white_daily_data)
# Last expression of the cell, so the notebook renders the trailing rows.
white_daily_data_1st_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
province_zipCode |
city_zipCode |
confirmed |
suspected |
cured |
dead |
dead_rate |
cured_rate |
updateDate |
|
|
|
|
|
|
|
|
2020-02-20 |
151280000 |
139543970.0 |
25.0 |
0.0 |
572.0 |
2.0 |
0.014783 |
4.479124 |
2020-02-21 |
153240000 |
140806270.0 |
476.0 |
0.0 |
610.0 |
4.0 |
0.008554 |
2.937501 |
2020-02-22 |
140740000 |
129510470.0 |
-348.0 |
0.0 |
418.0 |
6.0 |
0.064324 |
4.700671 |
2020-02-23 |
152910000 |
141346370.0 |
196.0 |
0.0 |
583.0 |
-2.0 |
-0.025925 |
3.689261 |
2020-02-24 |
130300000 |
123221439.0 |
-456.0 |
0.0 |
71.0 |
-1.0 |
0.015976 |
2.751535 |
# Chart the growth-speed series for the country minus the excluded province.
figure = plot_util.plot_conf_main(
    white_daily_data_1st_derivative,
    f'全国增长速度(除{black_province_name})',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_28_0.png)
display(Markdown(f'## 全国增长加速度(除{black_province_name})'))
# Growth acceleration: data_util.diff applied to the growth-speed series.
white_daily_data_2nd_derivative = data_util.diff(white_daily_data_1st_derivative)
# Last expression of the cell, so the notebook renders the trailing rows.
white_daily_data_2nd_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
province_zipCode |
city_zipCode |
confirmed |
suspected |
cured |
dead |
dead_rate |
cured_rate |
updateDate |
|
|
|
|
|
|
|
|
2020-02-20 |
151280000 |
139543970.0 |
-29.0 |
0.0 |
57.0 |
-4.0 |
-0.030846 |
0.530595 |
2020-02-21 |
153240000 |
140806270.0 |
451.0 |
0.0 |
38.0 |
2.0 |
-0.006229 |
-1.541622 |
2020-02-22 |
140740000 |
129510470.0 |
-824.0 |
0.0 |
-192.0 |
2.0 |
0.055770 |
1.763170 |
2020-02-23 |
152910000 |
141346370.0 |
544.0 |
0.0 |
165.0 |
-8.0 |
-0.090249 |
-1.011410 |
2020-02-24 |
130300000 |
123221439.0 |
-652.0 |
0.0 |
-512.0 |
1.0 |
0.041901 |
-0.937726 |
# Chart the growth-acceleration series for the country minus the excluded
# province.
figure = plot_util.plot_conf_main(
    white_daily_data_2nd_derivative,
    f'全国增长加速度(除{black_province_name})',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_31_0.png)
# Heading and chart title for the rest-of-country death / cure rate section.
# Both previously read '全国增死亡治愈率': the stray '增' was carried over from
# the neighbouring '全国增长速度' / '全国增长加速度' headings. The per-city
# sections title the same chart simply '死亡治愈率'.
display(Markdown(f'## 全国死亡治愈率(除{black_province_name})'))
figure = plot_util.plot_conf_dead_cured_ratio(
    white_daily_data,
    f'全国死亡治愈率%(除{black_province_name})',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_33_0.png)
# City analysed in this section. The visible cells never assign city_name, so
# this section would either fail with a NameError or silently reuse whatever
# value an earlier cell left behind; the recorded table for this section lists
# 武汉 (Wuhan) rows, which fixes the value used for that run.
city_name = '武汉'
display(Markdown(f'## {city_name}数据'))
# Reload the raw records before building this section's daily series.
raw_data = data_util.load_data()
display(Markdown(f'### {city_name}累计数量'))
# Per-day series for the chosen city; calculate_dead_cured_rate presumably adds
# the dead_rate / cured_rate columns seen in the recorded table.
city_daily_data = data_util.aggregate_daily(raw_data, city_name)
city_daily_data = data_util.calculate_dead_cured_rate(city_daily_data)
# Keep this expression last so the notebook renders the five latest rows.
city_daily_data.tail(5)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5336 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
45027 |
0 |
5598 |
1585 |
2020-02-20 22:08:31.581 |
2020-02-20 |
3.520110 |
12.432540 |
4068 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
45346 |
0 |
6281 |
1684 |
2020-02-21 20:57:17.698 |
2020-02-21 |
3.713668 |
13.851277 |
2545 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
45660 |
0 |
7292 |
1774 |
2020-02-22 20:55:28.727 |
2020-02-22 |
3.885239 |
15.970215 |
928 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
46201 |
0 |
8189 |
1856 |
2020-02-23 20:06:03.658 |
2020-02-23 |
4.017229 |
17.724725 |
0 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
46607 |
0 |
8950 |
1987 |
2020-02-24 19:12:21.027 |
2020-02-24 |
4.263308 |
19.203124 |
# Chart the cumulative daily series of the selected city.
figure = plot_util.plot_conf_main(city_daily_data, f'{city_name}累计')
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_39_0.png)
display(Markdown(f'### {city_name}增长速度'))
# Growth speed: data_util.diff applied to the cumulative series. The recorded
# numeric columns are day-over-day changes of the cumulative values.
city_daily_data_1st_derivative = data_util.diff(city_daily_data)
# Last expression of the cell, so the notebook renders the trailing rows.
city_daily_data_1st_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5336 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
615.0 |
0.0 |
558.0 |
88.0 |
2020-02-20 22:08:31.581 |
2020-02-20 |
0.149400 |
1.084256 |
4068 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
319.0 |
0.0 |
683.0 |
99.0 |
2020-02-21 20:57:17.698 |
2020-02-21 |
0.193558 |
1.418736 |
2545 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
314.0 |
0.0 |
1011.0 |
90.0 |
2020-02-22 20:55:28.727 |
2020-02-22 |
0.171570 |
2.118938 |
928 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
541.0 |
0.0 |
897.0 |
82.0 |
2020-02-23 20:06:03.658 |
2020-02-23 |
0.131990 |
1.754510 |
0 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
406.0 |
0.0 |
761.0 |
131.0 |
2020-02-24 19:12:21.027 |
2020-02-24 |
0.246079 |
1.478399 |
# Chart the growth-speed (first difference) series of the selected city.
figure = plot_util.plot_conf_main(
    city_daily_data_1st_derivative,
    f'{city_name}增长速度',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_42_0.png)
display(Markdown(f'### {city_name}增长加速度'))
# Growth acceleration: data_util.diff applied a second time, i.e. to the
# growth-speed series rather than to the cumulative one.
city_daily_data_2nd_derivative = data_util.diff(city_daily_data_1st_derivative)
# Last expression of the cell, so the notebook renders the trailing rows.
city_daily_data_2nd_derivative.tail()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
provinceName |
provinceEnglishName |
province_zipCode |
cityName |
cityEnglishName |
city_zipCode |
confirmed |
suspected |
cured |
dead |
updateTime |
updateDate |
dead_rate |
cured_rate |
5336 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
-1045.0 |
0.0 |
-229.0 |
-28.0 |
2020-02-20 22:08:31.581 |
2020-02-20 |
0.008947 |
-0.315955 |
4068 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
-296.0 |
0.0 |
125.0 |
11.0 |
2020-02-21 20:57:17.698 |
2020-02-21 |
0.044159 |
0.334480 |
2545 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
-5.0 |
0.0 |
328.0 |
-9.0 |
2020-02-22 20:55:28.727 |
2020-02-22 |
-0.021988 |
0.700201 |
928 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
227.0 |
0.0 |
-114.0 |
-8.0 |
2020-02-23 20:06:03.658 |
2020-02-23 |
-0.039580 |
-0.364428 |
0 |
湖北省 |
Hubei |
420000 |
武汉 |
Wuhan |
420100.0 |
-135.0 |
0.0 |
-136.0 |
49.0 |
2020-02-24 19:12:21.027 |
2020-02-24 |
0.114089 |
-0.276111 |
# Chart the growth-acceleration (second difference) series of the selected
# city.
figure = plot_util.plot_conf_main(
    city_daily_data_2nd_derivative,
    f'{city_name}增长加速度',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_45_0.png)
display(Markdown(f'### {city_name}死亡治愈率'))
# Death / cure rate chart, drawn from the cumulative series (the one that
# carries the dead_rate and cured_rate columns).
figure = plot_util.plot_conf_dead_cured_ratio(
    city_daily_data,
    f'{city_name}死亡治愈率%',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_47_0.png)
display(Markdown(f'### {city_name}数据校验'))
# Per-city Benford's-law check: same helper as the nationwide check, fed with
# this city's daily 'confirmed' column.
city_confirmed = city_daily_data['confirmed']
benford_raw = data_util.benford(city_confirmed)
figure = plot_util.plot_bar(
    benford_raw,
    f'{city_name}数据校验(本福特定律)',
    'Digit',
    'Percent',
)
![png](https://raw.githubusercontent.com/donaldwuid/py_ncov2019/master/./Readme/output_49_0.png)