HPMN P8596 Final Project
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the shelter census. The first (unnamed) CSV column holds the period
# label, which is treated below as "<month abbreviation> <year>".
data = pd.read_csv('NYCHomeless.csv').rename(columns={'Unnamed: 0': 'month_year'})
# Rewrite 'SEPT' as 'SEP' before the '%b %Y' parse further down
# (presumably because '%b' expects the three-letter abbreviation).
data['month_year'] = data['month_year'].str.replace('SEPT', 'SEP')
# Keep the token before the first space (the month name) as its own column.
data['month'] = data['month_year'].str.split(' ').str.get(0)
from datetime import datetime
data['month_year'] = pd.to_datetime(data['month_year'], format='%b %Y')
data['year'] = data['month_year'].dt.year
# Restrict the analysis window to 1990-2017 inclusive.
data = data.loc[(data['year'] >= 1990) & (data['year'] <= 2017)]
Descriptive Statistics
# Descriptive statistics
# Shows rows 1-6 and the first seven columns of describe(), rounded to one
# decimal. NOTE(review): the slice is positional, so which statistics it
# selects depends on the row order describe() produces - confirm against the
# installed pandas version.
data.describe().iloc[1:7, 0:7].round(1)
Data Exploration
Total Homeless Population in DOH Shelters
# Fig. 1: total shelter population (total_pop) against the parsed month_year
# dates, with x tick labels rotated 70 degrees.
ax1 = data.plot(x='month_year', y='total_pop', rot=70,
                title='Fig. 1. NYC Homeless Population in shelters')
# The label previously read "Poeple".
ax1.set_ylabel('Number of People')
NYC Homeless Shelter Admissions by Group, 1990~2017
# Fig. 2: one line per sub-population on the current pyplot axes.
# Each label is attached to its own line, so the legend cannot drift out of
# step with the plotting order.
for column, legend_label in (('children', 'Children'),
                             ('single_men', 'Single men'),
                             ('single_women', 'Single women'),
                             ('persons_in_fam', 'Family')):
    plt.plot(data.month_year, data[column], label=legend_label)
plt.legend(loc='upper left')
plt.title('Fig. 2. Homeless Population in DOH Shelters, Stratified by Status')
# The label previously read "Poeple".
plt.ylabel('Number of People')
plt.show()
# Yearly mean of each sub-population. The columns are selected with a list:
# indexing a GroupBy with a bare tuple of column names was deprecated and is
# rejected by current pandas.
summary = data.groupby('year')[['children', 'single_men', 'single_women', 'persons_in_fam']].mean()

# One figure, three grid columns: box plot in the first, pie chart in the third.
fig = plt.figure()
ax2 = fig.add_subplot(1, 3, 1)
ax3 = fig.add_subplot(1, 3, 3)

# Fig. 3: distribution of the yearly means for each group.
summary.plot.box(ax=ax2, rot=70, title="Fig. 3. Homeless Population Boxplot by Group, 1990~2017")
# The label previously read "Poeple".
ax2.set_ylabel('Number of People')

# Fig. 4: share of each group in the 2015 yearly means.
summary.loc[2015].plot.pie(ax=ax3, figsize=(13, 4), autopct='%1.1f%%', title='Fig. 4. Homeless Population Pie Chart by Group, 2015')
Seasonal Changes
# Fig. 5: five vertically stacked panels sharing one x axis, one panel per
# sampled year, each plotting total_pop against the month label.
f, axarr = plt.subplots(5, sharex=True)
for panel, sampled_year in zip(axarr, (1995, 2000, 2005, 2010, 2015)):
    rows_for_year = data[data.year == sampled_year]
    panel.plot(rows_for_year.month, rows_for_year.total_pop)
    panel.set_title(str(sampled_year))

f.subplots_adjust(hspace=0.6)
# The y label is attached to the middle panel only.
axarr[2].set_ylabel('Number of People')
plt.suptitle('Fig. 5. Homeless Population in shelters, Seasonal Changes by Year')

# Hide x ticks on every panel; only the last panel keeps its tick labels.
last_panel = len(axarr) - 1
for position, panel in enumerate(axarr):
    panel.tick_params(axis='x', which='both', bottom=False, top=False,
                      labelbottom=(position == last_panel))
plt.show()
# Load the county-level table and keep only the rows whose County is one of
# the five counties this script treats as New York City.
violent = pd.read_csv('index_violent_property_and_firearm_rates.csv')
nyc_pop = violent[violent.County.isin(['New York', 'Bronx', 'Kings', 'Richmond', 'Queens'])]
# Sum the Population column over those counties for each Year; the result is a
# one-column frame indexed by Year.
population = nyc_pop.groupby('Year')['Population'].sum().to_frame()
# Join the two data sets.
# Aggregate the monthly homeless data to yearly values (mean of total_pop per
# year), then attach the yearly city population by joining on the index.
# The earlier `data2 = data[:]` was a dead store, overwritten on the very next
# line, and has been dropped.
data2 = data.groupby('year')['total_pop'].mean().to_frame()
data2 = data2.join(population)
data2.reset_index(inplace=True)
newcols = {
    'total_pop': 'homeless_pop',
    'Population': 'nyc_pop',
}
data2.rename(columns=newcols, inplace=True)

# Dummy variable 'policy': 1 for the years 2007-2011 inclusive, 0 otherwise,
# stored as a categorical column.
data2['policy'] = 0
data2.loc[(data2.year >= 2007) & (data2.year <= 2011), 'policy'] = 1
data2['policy'] = data2.policy.astype('category')
# NYC population plot
# Turn the Year index back into a column, rename both columns, and parse the
# year as a datetime so it plots on a date axis.
population.reset_index(inplace=True)
population.columns = ['year', 'nyc_pop']
population['year'] = pd.to_datetime(population['year'], format='%Y')
# NOTE(review): the title says 1990~2007 while other figures in this file say
# 1990~2017 - confirm which range the population table actually covers.
ax4 = population.plot(x='year', y='nyc_pop', title='Fig. 6. NYC Total Population over Time, 1990~2007')
# The label previously read "Poeple".
ax4.set_ylabel('Number of People')
from matplotlib import gridspec
# Correlation between the yearly shelter population and the city population
# (the value is not stored).
data2['homeless_pop'].corr(data2['nyc_pop'])

# Fig. 7: two equal-height panels stacked with no gap. The lower panel shares
# the x axis of the upper one.
fig = plt.figure()
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])

ax0 = plt.subplot(gs[0])
line0 = ax0.plot(population.year, population.nyc_pop, color='orange')[0]
ax1 = plt.subplot(gs[1], sharex=ax0)
line1 = ax1.plot(data.month_year, data.total_pop, color='skyBlue')[0]

# The upper panel's x tick labels are hidden; the lower panel carries them.
for tick_label in ax0.get_xticklabels():
    tick_label.set_visible(False)
# Remove the last y tick label of the second subplot.
yticks = ax1.yaxis.get_major_ticks()
yticks[-1].label1.set_visible(False)

# A single legend on the upper panel describes the lines of both panels.
ax0.legend((line0, line1), ('nyc population', 'homeless population'), loc='upper left')
ax0.set_ylabel('Number of Population')
plt.subplots_adjust(hspace=.0)
plt.suptitle('Fig. 7. NYC Population v. NYC Homeless Shelter Population over Time, 1990~2007')
plt.show()
# Fig. 8: yearly shelter population against city population, to inspect the
# linear relationship between nyc_pop and homeless_pop.
data2.plot.scatter(x='nyc_pop', y='homeless_pop', title='Fig. 8. NYC Homeless Shelter Population v. NYC Population')

# Share of the city population that is in shelters, per year.
data2['proportion_homeless_total'] = data2['homeless_pop'] / data2['nyc_pop']

# Fig. 9: that share over time, with the y axis fixed to [0, 0.01].
ax = data2.plot.scatter(x='year', y='proportion_homeless_total', title='Fig. 9. Proportion of NYC Population in Homeless Shelters')
ax.set_ylim([0, 0.01])
ax.set_ylabel('Proportion of total population in shelters')

# The mean proportion is printed as a percentage. The message previously
# misspelled "population" as "populaiton".
print("Average percentage of homeless population (in shelter) over nyc population = {:.5f}%".format(data2["proportion_homeless_total"].mean()*100))
# Year-over-year percentage change of each population series:
#     (value - previous_value) * 100.0 / previous_value
# The first row has no predecessor and stays NaN.
# Columns are addressed by name rather than by hard-coded position, so adding
# or reordering columns in data2 cannot silently redirect the calculation.
for source_column, rate_column in (('homeless_pop', 'inc_rate_homeless'),
                                   ('nyc_pop', 'inc_rate_nyc')):
    previous_value = data2[source_column].shift(1)
    data2[rate_column] = (data2[source_column] - previous_value) * 100.0 / previous_value
from sklearn import linear_model
# Split the yearly table at 2007: bf2007 holds the years before 2007 and
# af2007 holds 2007 onwards. Splitting on the year value (not on row position
# 17) keeps the split correct if the first year or row count ever changes.
# .copy() makes both halves independent frames, so adding columns to them
# later does not act on a slice of data2.
bf2007 = data2[data2.year < 2007].copy()
af2007 = data2[data2.year >= 2007].copy()
Regression 1: Proportion ~ Year
# Regression 1: fit proportion ~ year on the bf2007 subset, then predict for
# every year in data2 so the fitted trend extends across the whole plot.
reg1 = linear_model.LinearRegression()
reg1.fit(bf2007[['year']], bf2007.proportion_homeless_total)
# The prediction is stored directly as a column; the former reshape(-1, 1)
# only turned it into a single-column 2-D array and is not needed.
data2['prediction1'] = reg1.predict(data2[['year']])

ax = data2.plot.scatter(x='year', y='proportion_homeless_total')
trend_line, = ax.plot(data2.year, data2.prediction1, color='orange')
# Each legend label is paired with its artist explicitly. With labels alone,
# matplotlib assigns them in its internal artist order, which differs between
# releases and can swap "predicted" and "actual".
ax.legend([trend_line, ax.collections[0]],
          ['predicted homeless population', 'actual homeless population'],
          loc='upper left')
# Dashed markers at 2007 and 2011, the bounds of the 'policy' dummy period.
ax.axvline(x=2007, color='k', linestyle='--', label='2007')
ax.axvline(x=2011, color='k', linestyle='--', label='2011')
ax.set_ylabel('Proportion of total population in shelters')
ax.set_ylim([0, 0.01])
plt.title('Fig. 10. Predicted Proportion ')
plt.show()
This linear regression is fitted using data from 1990 to 2006. The prediction shows the trend the proportion would have followed had the policy not been implemented.
# One-row table holding the fitted coefficient for the 'year' predictor.
coefficients = pd.DataFrame({
    'variable': ['year'],
    'coefficient': reg1.coef_,
})
coefficients
# Score of reg1 on the same bf2007 rows it was fitted on.
train_features = bf2007[['year']]
train_target = bf2007.proportion_homeless_total
print('R_sqr:', reg1.score(train_features, train_target))
Regression 2: Homeless population ~ NYC total population
# Regression 2: fit homeless_pop ~ nyc_pop on the bf2007 subset and predict
# the shelter population for the af2007 rows.
reg2 = linear_model.LinearRegression()
reg2.fit(bf2007[['nyc_pop']], bf2007.homeless_pop)
# assign() returns a new frame with the extra column. Setting the column in
# place on af2007 (a slice of data2) triggers pandas' SettingWithCopyWarning.
# The former reshape(-1, 1) is not needed to store the prediction.
af2007 = af2007.assign(prediction2=reg2.predict(af2007[['nyc_pop']]))

# Fig. 11: actual yearly shelter population with the prediction overlaid.
ax = data2.plot(x='year', y='homeless_pop')
plt.plot(af2007.year, af2007.prediction2)
# Dashed markers at 2007 and 2011, the bounds of the 'policy' dummy period.
ax.axvline(x=2007, color='k', linestyle='--', label='2007')
ax.axvline(x=2011, color='k', linestyle='--', label='2011')
plt.legend(['actual homeless population', 'predicted homeless population if policy not implemented'], loc='upper left')
ax.set_ylabel('Homeless population')
plt.title('Fig. 11. Predicted Homeless Population')
plt.show()
This linear regression is fitted using data from 1990 to 2006. The prediction shows the trend the homeless population would have followed had the policy not been implemented.