-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbambi_pandas.py
52 lines (37 loc) · 1.31 KB
/
bambi_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# -*- coding: utf-8 -*-
"""
Created on Sun Sep 13 17:18:16 2015
@author: jeff
Idea: download the last few years of name data and see which names are
trending, then add that as an additional value to the dictionary.
"""
### user-settable variables ###
# Output size: how many names the final random sample should contain.
get = 100

# Filters applied to the name table.
# Which sex to keep: 'F' or 'M'.
sex = 'F'
# Restrict names to those that start with these letters; an empty list
# applies no restriction.
start_letters = []

# Numeric inputs to the `size` formula; all are left at 999999 by default.
due_date = 999999
aatc = 999999  # address at time of conception
parent1_bday = 999999
parent2_bday = 999999
### end user-settable variables ###
import matplotlib
import numpy as np
import pandas as pd
# NOTE(review): `size` is not used by any later line visible in this script.
# Under Python 2 the inner `/` is integer division, so it can evaluate to 0
# (and raise ZeroDivisionError) when aatc > due_date -- confirm intent.
size = (parent1_bday + parent2_bday) / (due_date / aatc)

# The input file has no header row, so column names are supplied here:
# name, sex, and a numeric column called `p`.
df = pd.read_csv('yob2014.txt', names=['name', 'sex', 'p'])

# refine based on sex
dfg = df[df.sex == sex]

# refine based on letter: keep a name when it starts with ANY of the
# requested prefixes. Filtering `dfg` once per prefix (as a running AND)
# would leave nothing as soon as two different letters are requested.
if start_letters:
    keep = np.zeros(len(dfg), dtype=bool)
    for let in start_letters:
        # na=False: a missing name never matches instead of yielding NaN.
        keep |= dfg['name'].str.startswith(let, na=False).values
    dfg = dfg[keep]
# Summary statistics are computed on the natural log of the `p` column.
log_counts = np.log(dfg.p)
plog_mean = log_counts.mean()
plog_std = log_counts.std()

# exp(mean(ln p)) is the geometric mean of p, and exp(std(ln p)) is the
# geometric (multiplicative) standard deviation.
# Single-argument print(...) produces the same output on Python 2 and 3.
print('mean name abundance is %s' % np.exp(plog_mean))
print('the standard deviation of name abundance is %s' % np.exp(plog_std))
print('most popular names in your subset:')
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
print(dfg.sort_values('p', ascending=False).head())
# calculate weight factor based on normal distribution
# The density is evaluated at ln(p) so that it uses the same log base as
# plog_mean / plog_std (both derived from np.log). Rows whose ln(p) is
# closest to plog_mean receive the largest weight.
if np.isfinite(plog_std) and plog_std > 0:
    prob = ((plog_std * np.sqrt(2 * np.pi)) ** -1 *
            np.exp(-(np.log(dfg.p) - plog_mean) ** 2 / (2 * plog_std ** 2)))
else:
    # Zero or undefined spread (e.g. fewer than two rows, or all `p` equal)
    # would give NaN weights; None makes sample() weight rows uniformly.
    prob = None

# get a random sample
# Sampling is without replacement, so never request more rows than exist.
print(dfg.sample(n=min(get, len(dfg)), weights=prob, replace=False))