数据获取

import pandas as pd
import urllib.request
import tempfile
import shutil
import zipfile
import matplotlib
import numpy as np
from matplotlib import pyplot as plt

# Download the Bike Sharing dataset archive into a scratch directory.
temp_dir = tempfile.mkdtemp()
try:
    data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
    zipname = temp_dir + '/Bike-Sharing-Dataset.zip'
    urllib.request.urlretrieve(data_source, zipname)

    # Extract the archive; the context manager closes it even if extraction fails.
    with zipfile.ZipFile(zipname, 'r') as zip_ref:
        zip_ref.extractall(temp_dir)

    # Read the daily data while the extracted file still exists.
    daily_path = temp_dir + '/day.csv'
    daily_data = pd.read_csv(daily_path)
finally:
    # Always remove the scratch directory, including when the download,
    # the extraction or the CSV parsing raises.
    shutil.rmtree(temp_dir)

# Convert the date strings into datetime values.
daily_data['dteday'] = pd.to_datetime(daily_data['dteday'])
# Columns that are not needed for the plots below.
drop_list = ['instant', 'season', 'yr', 'mnth', 'holiday', 'workingday', 'weathersit', 'atemp', 'hum']
daily_data.drop(drop_list, inplace=True, axis=1)
Attribute Information:

Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv

- instant: record index
- dteday : date
- season : season (1:spring, 2:summer, 3:fall, 4:winter)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : whether day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit :
    - 1: Clear, Few clouds, Partly cloudy
    - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
- temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- hum: Normalized humidity. The values are divided by 100 (max)
- windspeed: Normalized wind speed. The values are divided by 67 (max)
- casual: count of casual users
- registered: count of registered users
- cnt: count of total rental bikes including both casual and registered

参数配置

# Global matplotlib defaults shared by every figure below.
matplotlib.rcParams.update({
    # Figure size: 7" x 4"
    'figure.figsize': (7, 4),
    # Font size: 7
    'font.size': 7,
    # Hide the top and right axis lines
    'axes.spines.top': False,
    'axes.spines.right': False,
    # No grid
    'axes.grid': False,
    # White plot background
    'axes.facecolor': 'white',
})

散点图

# Reusable helper that wraps the scatter-plot boilerplate.
def scatterplot(x_data, y_data, x_label, y_label, title):
    """Draw a scatter plot of y_data against x_data on a new figure.

    Args:
        x_data: values for the horizontal axis.
        y_data: values for the vertical axis, paired with x_data.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    # Create a new figure with a single axes object.
    _, ax = plt.subplots()

    # Plot the points, setting marker size, colour and transparency.
    # Colour reference: http://www.114la.com/other/rgb.htm
    # (This call had been swallowed by the comment above it, so no points were drawn.)
    ax.scatter(x_data, y_data, s=10, color='#539caf', alpha=0.75)

    # Add the title and the axis labels.
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

# Draw the scatter plot of check outs against temperature.
scatterplot(
    x_data=daily_data['temp'],
    y_data=daily_data['cnt'],
    title='Number of Check Outs vs Temperature',
    x_label='Normalized temperature (C)',
    y_label='Check outs',
)

76cccae51e484ac998be143b1fe75e15-image.png

曲线图

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Response variable: total check outs.
y = daily_data['cnt']
# Regressor: temperature plus a constant column, so the model is y = k*x + b.
x = sm.add_constant(daily_data['temp'])
# Ordinary least squares (OLS) model and its fit.
regr = sm.OLS(y, x)
res = regr.fit()
# Pull the fitted data out of the model at alpha = 5%.
# st: summary table, data: the underlying values, ss2: the column names.
st, data, ss2 = summary_table(res, alpha=0.05)
# Column 2 is used as the fitted values.
fitted_values = data[:, 2]

# Reusable helper that wraps the line-plot boilerplate.
def lineplot(x_data, y_data, x_label, y_label, title):
    """Draw y_data against x_data as a single line on a new figure.

    Args:
        x_data: values for the horizontal axis.
        y_data: values for the vertical axis, paired with x_data.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    axes = plt.subplots()[1]

    # Fully opaque line, two points wide.
    axes.plot(x_data, y_data, color='#539caf', alpha=1, lw=2)

    # Title and axis labels.
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

# Draw the fitted line.
lineplot(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    title='Line of Best Fit for Number of Check Outs vs Temperature',
    x_label='Normalized temperature (C)',
    y_label='Check outs',
)

033f6563d5ac44f4a120c55bbc9d75f1-image.png

带置信区间的曲线图

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import summary_table

# Response variable: total check outs.
y = daily_data['cnt']
# Regressor: temperature plus a constant column, so the model is y = k*x + b.
x = sm.add_constant(daily_data['temp'])
# Ordinary least squares (OLS) model and its fit.
regr = sm.OLS(y, x)
res = regr.fit()
# Pull the fitted data out of the model at alpha = 5%.
# st: summary table, data: the underlying values, ss2: the column names.
st, data, ss2 = summary_table(res, alpha=0.05)
fitted_values = data[:, 2]

# Lower and upper bounds of the interval at alpha = 5% (columns 4 and 5).
predict_mean_ci_low = data[:, 4]
predict_mean_ci_upp = data[:, 5]

# Collect x and both interval bounds in one DataFrame ...
CI_df = pd.DataFrame({
    'x_data': daily_data['temp'],
    'low_CI': predict_mean_ci_low,
    'upper_CI': predict_mean_ci_upp,
})
# ... and order the rows by x_data.
CI_df = CI_df.sort_values('x_data')

# Line plot with a shaded confidence band.
def lineplotCI(x_data, y_data, sorted_x, low_CI, upper_CI, x_label, y_label, title):
    """Draw a fitted line together with a shaded band between two bounds.

    Args:
        x_data: x values of the fitted line.
        y_data: y values of the fitted line.
        sorted_x: x values of the band, passed to fill_between as given.
        low_CI: lower bound of the band at each sorted_x.
        upper_CI: upper bound of the band at each sorted_x.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    axes = plt.subplots()[1]

    # Fitted line.
    axes.plot(x_data, y_data, color='#539caf', alpha=1, lw=1, label='Fit')
    # Band between the two bounds, filled along sorted_x.
    axes.fill_between(sorted_x, low_CI, upper_CI, alpha=0.4, color='#539caf', label='95% CI')

    # Title and axis labels.
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

    # Legend built from the label= arguments; loc='best' lets matplotlib place it.
    axes.legend(loc='best')

# Draw the fitted line with its confidence band.
lineplotCI(
    x_data=daily_data['temp'],
    y_data=fitted_values,
    sorted_x=CI_df['x_data'],
    low_CI=CI_df['low_CI'],
    upper_CI=CI_df['upper_CI'],
    title='Line of Best Fit for Number of Check Outs vs Temperature',
    x_label='Normalized temperature (C)',
    y_label='Check outs',
)

a15b18809de84a23bc2d7c257845f7dd-image.png

双坐标曲线图

# Line plot with two vertical axes.
def lineplot2y(x_data, x_label, y1_data, y1_color, y1_label, y2_data, y2_color, y2_label, title):
    """Draw two series against the same x values, each with its own y axis.

    Args:
        x_data: shared values for the horizontal axis.
        x_label: label of the horizontal axis.
        y1_data: series drawn against the left axis.
        y1_color: colour of the first line and of the left axis label.
        y1_label: label of the left axis.
        y2_data: series drawn against the right axis.
        y2_color: colour of the second line and of the right axis label.
        y2_label: label of the right axis.
        title: title of the plot.
    """
    left_axes = plt.subplots()[1]
    left_axes.plot(x_data, y1_data, color=y1_color)
    # Title and labels of the primary axes.
    left_axes.set_title(title)
    left_axes.set_xlabel(x_label)
    left_axes.set_ylabel(y1_label, color=y1_color)

    # The second axes shares the horizontal axis with the first.
    right_axes = left_axes.twinx()
    right_axes.plot(x_data, y2_data, color=y2_color)
    right_axes.set_ylabel(y2_label, color=y2_color)
    # Make the right-hand axis line visible for this axes.
    right_axes.spines['right'].set_visible(True)

# Draw check outs and wind speed over time on two vertical axes.
lineplot2y(
    title='Check Outs and Windspeed Over Time',
    x_data=daily_data['dteday'],
    x_label='Day',
    y1_data=daily_data['cnt'],
    y1_label='Check outs',
    y1_color='#539caf',
    y2_data=daily_data['windspeed'],
    y2_label='Normalized windspeed',
    y2_color='#7663b0',
)

903b8397a0774ccb9278fc29cca449bc-image.png

直方图

# Helper that draws a histogram.
def histogram(data, x_label, y_label, title):
    """Draw a 10-bin histogram of data on a new figure.

    Args:
        data: values to bin.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    axes = plt.subplots()[1]
    # The number of bins is fixed at 10.
    axes.hist(data, bins=10, color='#539caf')
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

# Draw the distribution of registered check outs.
histogram(
    data=daily_data['registered'],
    title='Distribution of Registered Check Outs',
    x_label='Check outs',
    y_label='Frequency',
)

eca194ebfcb34dc8b566673486df8b40-image.png

堆叠直方图

# Draw two overlaid histograms that share the same bins.
def overlaid_historgram(data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title):
    """Draw the histograms of two data sets on top of each other.

    Both histograms use the same 10 equal-width bins spanning the combined
    range of data1 and data2, so their bars line up.

    Args:
        data1: first data set (drawn fully opaque).
        data1_name: legend entry for data1.
        data1_color: bar colour for data1.
        data2: second data set (drawn on top, partly transparent).
        data2_name: legend entry for data2.
        data2_color: bar colour for data2.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    max_nbins = 10
    lowest = min(np.min(data1), np.min(data2))
    highest = max(np.max(data1), np.max(data2))
    if lowest == highest:
        # All values are identical: widen the range so the bins have a width.
        lowest, highest = lowest - 0.5, highest + 0.5
    # linspace yields exactly max_nbins + 1 edges with the last edge equal to
    # the maximum. arange with a float step could add an extra bin or stop
    # just short of the maximum because of rounding.
    bins = np.linspace(lowest, highest, max_nbins + 1)

    # Create the figure and draw both histograms on the same axes.
    _, ax = plt.subplots()
    ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
    ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='best')

# Draw the registered and casual distributions on top of each other.
overlaid_historgram(
    data1=daily_data['registered'],
    data1_name='Registered',
    data1_color='#539caf',
    data2=daily_data['casual'],
    data2_name='Casual',
    data2_color='#7663b0',
    title='Distribution of Check Outs By Type',
    x_label='Check outs',
    y_label='Frequency',
)

66805fce28b34edda1deb938e257f799-image.png

密度估计曲线

# Estimate the probability density of registered check outs.
from scipy.stats import gaussian_kde
data = daily_data['registered']
# Kernel density estimate: https://en.wikipedia.org/wiki/Kernel_density_estimation
# bw_method controls the smoothing: the larger the value, the smoother the curve.
# A scalar bw_method is used directly as the bandwidth factor, which replaces
# overriding covariance_factor and calling the private _compute_covariance().
density_est = gaussian_kde(data, bw_method=.3)
# Points at which the density is evaluated: a step of 200 across the data range.
x_data = np.arange(min(data), max(data), 200)

# Helper that draws a density-estimate curve.
def densityplot(x_data, density_est, x_label, y_label, title):
    """Draw density_est evaluated at x_data as a line on a new figure.

    Args:
        x_data: points at which the density is evaluated.
        density_est: callable that maps x_data to density values.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    axes = plt.subplots()[1]
    density_values = density_est(x_data)
    axes.plot(x_data, density_values, lw=2, color='#539caf')
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)

# Draw the density estimate of registered check outs.
densityplot(
    x_data=x_data,
    density_est=density_est,
    title='Distribution of Registered Check Outs',
    x_label='Check outs',
    y_label='Frequency',
)

edbe7de1e00740158b670982c060927d-image.png

柱状图

# Mean and standard deviation of total check outs per day of the week.
# The aggregations are named by string: passing np.mean / np.std is deprecated
# in recent pandas, and the strings keep the pandas group-wise implementations.
mean_total_co_day = daily_data[['weekday', 'cnt']].groupby('weekday').agg(['mean', 'std'])
# Drop the outer column level so the columns are just 'mean' and 'std'.
mean_total_co_day.columns = mean_total_co_day.columns.droplevel()

# Helper that draws a bar chart with error bars.
def barplot(x_data, y_data, error_data, x_label, y_label, title):
    """Draw a bar chart with an error bar on each bar.

    Args:
        x_data: positions of the bars on the horizontal axis.
        y_data: heights of the bars.
        error_data: half-length of the error bar of each bar.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    _, ax = plt.subplots()
    # Bars centred on their x positions.
    ax.bar(x_data, y_data, color='#539caf', align='center')
    # Error bars; ls='none' removes the line that would otherwise connect them.
    # (This call had been swallowed by the comment above it, so no error bars were drawn.)
    ax.errorbar(x_data, y_data, yerr=error_data, color='#297083', ls='none', lw=5)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

# Draw mean check outs per day of the week with their standard deviation.
barplot(
    x_data=mean_total_co_day.index.values,
    y_data=mean_total_co_day['mean'],
    error_data=mean_total_co_day['std'],
    title='Total Check Outs By Day Of Week (0 = Sunday)',
    x_label='Day of week',
    y_label='Check outs',
)

25dab497aafd4a1892e5bb0c1c2a3a5f-image.png

堆积柱状图

# Mean registered and casual check outs per day of the week.
mean_by_reg_co_day = daily_data.groupby('weekday')[['registered', 'casual']].mean()
# Share of registered and casual check outs per day of the week.
mean_by_reg_co_day = mean_by_reg_co_day.assign(
    total=lambda frame: frame['registered'] + frame['casual'],
)
mean_by_reg_co_day = mean_by_reg_co_day.assign(
    reg_prop=lambda frame: frame['registered'] / frame['total'],
    casual_prop=lambda frame: frame['casual'] / frame['total'],
)

# Helper that draws a stacked bar chart.
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw several series as bars stacked on top of each other.

    Args:
        x_data: positions of the bars on the horizontal axis.
        y_data_list: one sequence of bar heights per series, bottom series first.
        y_data_names: legend entry for each series.
        colors: bar colour for each series.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    _, ax = plt.subplots()
    # Running top of the stack. Each series starts where the sum of all earlier
    # series ends, which also stacks three or more series correctly (using
    # only the previous series as the base would not).
    bottom = np.zeros(len(x_data))
    for i, y_data in enumerate(y_data_list):
        # When the caller passes normalised proportions the stack totals 1.
        ax.bar(x_data, y_data, color=colors[i], bottom=bottom, align='center', label=y_data_names[i])
        bottom = bottom + np.asarray(y_data, dtype=float)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    # Fixed legend position.
    ax.legend(loc='upper right')

# Draw the registered/casual proportions as stacked bars.
stackedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['reg_prop'], mean_by_reg_co_day['casual_prop']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
    x_label='Day of week',
    y_label='Proportion of check outs',
)

e947382ba99048bba1d5181b5b167fef-image.png

分组柱状图

# Mean registered and casual check outs per day of the week.
mean_by_reg_co_day = daily_data.groupby('weekday')[['registered', 'casual']].mean()
# Share of registered and casual check outs per day of the week.
mean_by_reg_co_day['total'] = mean_by_reg_co_day['registered'].add(mean_by_reg_co_day['casual'])
mean_by_reg_co_day['reg_prop'] = mean_by_reg_co_day['registered'].div(mean_by_reg_co_day['total'])
mean_by_reg_co_day['casual_prop'] = mean_by_reg_co_day['casual'].div(mean_by_reg_co_day['total'])

# Helper that draws a grouped bar chart.
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw several series as side-by-side bars around each x position.

    Args:
        x_data: numeric centre position of each group on the horizontal axis.
        y_data_list: one sequence of bar heights per series.
        y_data_names: legend entry for each series.
        colors: bar colour for each series.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    _, ax = plt.subplots()
    # Total width taken by one group of bars.
    total_width = 0.8
    # Width of a single bar inside a group.
    n_series = len(y_data_list)
    ind_width = total_width / n_series
    # Offset of each bar's centre from the centre of its group; these are the
    # same evenly spaced values the float-step arange produced, but always
    # exactly n_series of them.
    alteration = (np.arange(n_series) - (n_series - 1) / 2) * ind_width

    # Draw every series, shifted sideways by its offset.
    positions = np.asarray(x_data)
    for i, y_data in enumerate(y_data_list):
        ax.bar(positions + alteration[i], y_data, color=colors[i], label=y_data_names[i], width=ind_width)

    # Labels, title and legend are set once, after all series are drawn.
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')

# Draw registered and casual check outs side by side per day of the week.
groupedbarplot(
    x_data=mean_by_reg_co_day.index.values,
    y_data_list=[mean_by_reg_co_day['registered'], mean_by_reg_co_day['casual']],
    y_data_names=['Registered', 'Casual'],
    colors=['#539caf', '#7663b0'],
    title='Check Outs By Registration Status and Day of Week (0 = Sunday)',
    x_label='Day of week',
    y_label='Check outs',
)

1772b8b52f8a471eb89c479011019e8d-image.png

箱式图

# Group the check-out counts by day of the week; the box plot only needs one
# array of values per category.
days = np.unique(daily_data['weekday'])
bp_data = [daily_data[daily_data['weekday'] == day]['cnt'].values for day in days]

# Helper that draws a box plot.
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one box per entry of y_data on a new figure.

    Args:
        x_data: tick labels, one per box.
        y_data: sequence of value arrays, one per box.
        base_color: colour of the median line, the box outline and the caps.
        median_color: colour of the box fill and the whiskers.
        x_label: label of the horizontal axis.
        y_label: label of the vertical axis.
        title: title of the plot.
    """
    _, ax = plt.subplots()

    # NOTE(review): the median line uses base_color while median_color fills
    # the box and colours the whiskers; the parameter names suggest the
    # opposite mapping. Kept as in the original; confirm the intent.
    ax.boxplot(
        y_data,
        # Fill the boxes with colour.
        patch_artist=True,
        # Median line colour.
        medianprops={'color': base_color},
        # Box colours: color is the outline, facecolor is the fill.
        boxprops={'color': base_color, 'facecolor': median_color},
        # Whisker colour.
        whiskerprops={'color': median_color},
        # Whisker cap colour.
        capprops={'color': base_color},
    )

    # Label the boxes with x_data.
    ax.set_xticklabels(x_data)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

# Draw the distribution of total check outs per day of the week.
boxplot(
    x_data=days,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    title='Total Check Outs By Day of Week (0 = Sunday)',
    x_label='Day of week',
    y_label='Check outs',
)

81a3e0b31db04d2eb72c18cf36af3142-image.png

来源

@寒小阳