statisticsはPythonの標準で用意されている統計用の各種機能を持ったライブラリです。
平均や分散、正規分布の取り扱いを行う事ができます。
以降で使うため、サンプルの正規分布のデータを用意しておきます。
import random
random.seed(0)
# 平均2 標準偏差3 の正規分布乱数100万個
data = [random.gauss(2,3) for x in range(1000000) ]
statistics.mean(data) # => 2.003101708077845
from fractions import Fraction
statistics.mean([Fraction(1, 4), Fraction(1, 5), Fraction(1, 6), Fraction(1, 7)])
# => Fraction(319, 1680)
from decimal import Decimal
statistics.mean([Decimal("0.1"), Decimal("0.2"), Decimal("0.3"), Decimal("0.4")])
# => Fraction(319, 1680)
statistics.fmean(data) # => 2.003101708077845
from fractions import Fraction
statistics.fmean([Fraction(1, 4), Fraction(1, 5), Fraction(1, 6), Fraction(1, 7)])
# => 0.18988095238095237
from decimal import Decimal
statistics.fmean([Decimal("0.1"), Decimal("0.2"), Decimal("0.3"), Decimal("0.4")])
# => 0.25
statistics.geometric_mean(filter(lambda x: x > 0, data)) # => 2.348852376873991
statistics.geometric_mean([24,9,125]) # => 30.000000000000004
statistics.harmonic_mean(filter(lambda x: x > 0, data)) # => 0.5049265887002424
statistics.harmonic_mean([40, 60]) #=> 48.0
statistics.median(data) # => 2.0027491546951763
statistics.median([1,2,5,6,9]) # => 5
statistics.median([1,2,5,6,8,9]) # => 5.5
statistics.median_low(data) # => 2.002745121193964
statistics.median_low([1,2,5,6,9]) # => 5
statistics.median_low([1,2,5,6,8,9]) # => 5
statistics.median_high(data) # => 2.002753188196389
statistics.median_high([1,2,5,6,9]) # => 5
statistics.median_high([1,2,5,6,8,9]) # => 6
statistics.median_grouped(data) # => 1.502753188196389
statistics.median_grouped([1,2,3]) # => 2.0
statistics.median_grouped([1,2,2]) # => 1.75
statistics.median_grouped([1,2,2,3]) # => 2.0
statistics.median_grouped([1,2,2,3,3]) # => 2.25
statistics.mode(data) # => 4.825146214041993
statistics.mode([1,1,2,3,3,3,3,4,5,5,6,6,6,6]) # => 3
statistics.mode(['a','a','b','b','b','c','c','c','d']) # => 'b'
statistics.multimode(data) # => 2.025128538506135, 3.5108901691355143,…
statistics.multimode([1,1,2,3,3,3,3,4,5,5,6,6,6,6]) # => [3, 6]
statistics.multimode(['a','a','b','b','b','c','c','c','d']) # => ['b', 'c']
statistics.pstdev(data) # => 2.9998192000383304
statistics.pvariance(data) # => 8.998915232918609
random.seed(0)
loop = 10
stdevs = []
for i in range(loop):
sample_data = random.sample(data,10000)
stdevs.append(statistics.stdev(sample_data))
statistics.mean(stdevs)
# => 3.013484862216274
random.seed(0)
loop = 10
variances = []
for i in range(loop):
sample_data = random.sample(data,10000)
variances.append(statistics.variance(sample_data))
statistics.mean(variances)
# => 9.081665198551224
tiles = statistics.quantiles(data)
tiles # => [-0.017355856620645316, 2.0027491546951763, 4.026714654241148]
len(data) # => 1000000
sum([(1 if x <= tiles[0] else 0) for x in data]) # => 250000
sum([(1 if tiles[0] < x and x <= tiles[1] else 0) for x in data]) # => 250000
sum([(1 if tiles[1] < x and x <= tiles[2] else 0) for x in data]) # => 250000
sum([(1 if tiles[2] < x else 0) for x in data]) # => 250000
# 10分割
tiles = statistics.quantiles(data, n=10)
tiles # => [-1.8438196482118687, -0.5228557865524757, 0.42921456410951775,
# 1.2410669510576056, 2.0027491546951763, 2.763201232068235,
# 3.578172336618947, 4.5272166210547855, 5.847496513505737]
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.hist(data, bins=50)
ax.set_xticks([x for x in range(-5,8)])
tiles = statistics.quantiles(data)
ax.axvline(tiles[0], color='r', linestyle='-')
ax.axvline(tiles[1], color='r', linestyle='-')
ax.axvline(tiles[2], color='r', linestyle='-')
plt.show()
nd = statistics.NormalDist(1.0, 2.0)
nd # => NormalDist(mu=1.0, sigma=2.0)
nd.mean # => 1.0 平均
nd.median # => 1.0 中央値
nd.mode # => 1.0 最頻値
nd.stdev # => 2.0 標準偏差
nd.variance # => 4.0 分散
statistics.NormalDist.from_samples(data)
# => NormalDist(mu=2.003101708077845, sigma=2.9998206999490558)
nd.samples(2) # => [0.2114438019581495, 1.6427031628717423]
nd.pdf(0) # => 0.17603266338214976
nd.pdf(1) # => 0.19947114020071635
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = [-10+0.01*v for v in range(2100)]
y = [nd.pdf(v) for v in x]
plt.plot(x, y)
plt.show()
nd.cdf(0) # => 0.30853753872598694
nd.cdf(1) # => 0.5
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = [-10+0.01*v for v in range(2100)]
y = [nd.cdf(v) for v in x]
plt.plot(x, y)
plt.show()
nd.inv_cdf(0.1) # => -1.5631031310892016
nd.inv_cdf(0.5) # => 1.0
nd.inv_cdf(0.99) # => 5.6526957480816815
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = [0.01*v for v in range(1,100)]
y = [nd.inv_cdf(v) for v in x]
plt.plot(x, y)
plt.show()
nd1 = statistics.NormalDist(0.0, 1.0)
nd2 = statistics.NormalDist(3.0, 1.5)
nd1.overlap(nd2) # => 0.22381671704369155
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = [-10+0.01*v for v in range(2100)]
y1 = [nd1.pdf(v) for v in x]
y2 = [nd2.pdf(v) for v in x]
lap_y = [r if r <=s else s for r,s in zip(y1,y2)]
plt.plot(x, y1)
plt.plot(x, y2)
plt.fill_between(x,0,lap_y,facecolor='g',alpha=0.5)
plt.show()
sum([y*0.01 for y in lap_y]) # => 0.22381625164019625
nd.quantiles()
# => [-0.3489795003921634, 1.0, 2.348979500392163]
nd.quantiles(n=6)
# => [-0.9348431322034028, 0.1385454014090851, 1.0, 1.8614545985909148, 2.9348431322034028]
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
x = [-10+0.01*v for v in range(2100)]
y = [nd.pdf(v) for v in x]
plt.plot(x, y)
quantiles = nd.quantiles(n=6)
ax.axvline(quantiles[0], color='r', linestyle='-')
ax.axvline(quantiles[1], color='r', linestyle='-')
ax.axvline(quantiles[2], color='r', linestyle='-')
ax.axvline(quantiles[3], color='r', linestyle='-')
ax.axvline(quantiles[4], color='r', linestyle='-')
plt.show()
nd.zscore(0) # => -0.5
nd.zscore(1) # => 0.0
nd.zscore(2) # => 0.5
nd.zscore(3) # => 1.0
nd = statistics.NormalDist(1.0, 2.0)
nd # => NormalDist(mu=1.0, sigma=2.0)
nd*2 # => NormalDist(mu=2.0, sigma=4.0)
nd*3 # => NormalDist(mu=3.0, sigma=6.0)
nd+1 # => NormalDist(mu=2.0, sigma=2.0)
nd+2 # =>NormalDist(mu=3.0, sigma=2.0)
nd*2+1 # =>NormalDist(mu=3.0, sigma=4.0)
Crieitは誰でも投稿できるサービスです。 是非記事の投稿をお願いします。どんな軽い内容でも投稿できます。
また、「こんな記事が読みたいけど見つからない!」という方は是非記事投稿リクエストボードへ!
こじんまりと作業ログやメモ、進捗を書き残しておきたい方はボード機能をご利用ください。
ボードとは?
コメント