Examining the Impact of the New York Advantage Program on the NYC Homeless Population

HPMN P8596 Final Project

Homeless Data

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the shelter census. The CSV's first column arrives as 'Unnamed: 0'
# and holds a "<MONTH> <YEAR>" label, so give it a meaningful name.
data = pd.read_csv('NYCHomeless.csv').rename(columns={'Unnamed: 0': 'month_year'})

# Replace the 'SEPT' spelling with 'SEP' before the labels are parsed as dates.
data['month_year'] = data['month_year'].str.replace('SEPT', 'SEP')

# Keep the whitespace-split tokens in a helper column; the first token is the month.
data['str_split'] = data['month_year'].str.split(' ')
data['month'] = data['str_split'].str[0]

from datetime import datetime
# Parse the "<Mon> <YYYY>" labels into timestamps and derive the calendar year.
data['month_year'] = pd.to_datetime(data.month_year, format='%b %Y')
data['year'] = data.month_year.dt.year

# The token-list helper column is not needed once the dates are parsed.
del data['str_split']

# Restrict the analysis to 1990 through 2017, both ends inclusive.
data = data[data['year'].between(1990, 2017)]

Descriptive Statistics

# Summary table: rows 1-6 of describe() (mean through 75% in the recorded
# output, i.e. without the first and last rows) for the first seven columns,
# rounded to one decimal place.
data.describe().iloc[1:7, :7].round(decimals=1)

Data Exploration

Total Homeless Population in DOH Shelters

# Fig. 1: total shelter population over time, with x tick labels rotated
# 70 degrees.
ax1 = data.plot(
    x='month_year',
    y='total_pop',
    rot=70,
    title='Fig. 1. NYC Homeless Population in shelters',
)
# Axis label spelling corrected ('Poeple' -> 'People'), matching the label
# already used for Fig. 5.
ax1.set_ylabel('Number of People')

Text(0,0.5,'Number of Poeple')

NYC Homeless Shelter Admissions by Group, 1990~2017

# Fig. 2: one line per shelter sub-population. Each line carries its own
# label, so the legend text cannot drift out of step with the plotting order.
for column, label in [
    ('children', 'Children'),
    ('single_men', 'Single men'),
    ('single_women', 'Single women'),
    ('persons_in_fam', 'Family'),
]:
    plt.plot(data.month_year, data[column], label=label)
plt.legend(loc='upper left')
plt.title('Fig. 2. Homeless Population in DOH Shelters, Stratified by Status')
# Axis label spelling corrected ('Poeple' -> 'People').
plt.ylabel('Number of People')
plt.show()

# Yearly mean of each sub-population. The columns are selected with a list:
# indexing a groupby with a bare tuple of several names is deprecated and is
# rejected by newer pandas releases.
summary = data.groupby('year')[
    ['children', 'single_men', 'single_women', 'persons_in_fam']
].mean()

# Use the left and right slots of a 1x3 grid; the middle slot stays empty.
fig = plt.figure()
ax2 = fig.add_subplot(1, 3, 1)
ax3 = fig.add_subplot(1, 3, 3)

# Fig. 3: distribution of the yearly means for each group.
summary.plot.box(ax=ax2, rot=70, title="Fig. 3. Homeless Population Boxplot by Group, 1990~2017")
# Axis label spelling corrected ('Poeple' -> 'People').
ax2.set_ylabel('Number of People')

# Fig. 4: composition of the 2015 yearly means. figsize is passed together
# with an existing axes, exactly as in the original call.
summary.loc[2015].plot.pie(
    ax=ax3,
    figsize=(13, 4),
    autopct='%1.1f%%',
    title='Fig. 4. Homeless Population Pie Chart by Group, 2015',
)

<matplotlib.axes._subplots.AxesSubplot at 0x115a6a588>

Seasonal Changes

# Fig. 5: one stacked panel per sampled year, all sharing the month axis.
f, axarr = plt.subplots(5, sharex=True)

for panel, yr in zip(axarr, (1995, 2000, 2005, 2010, 2015)):
    one_year = data[data.year == yr]
    panel.plot(one_year.month, one_year.total_pop)
    panel.set_title(str(yr))

f.subplots_adjust(hspace=0.6)
# The y label is attached to the middle panel only.
axarr[2].set_ylabel('Number of People')

plt.suptitle('Fig. 5. Homeless Population in shelters, Seasonal Changes by Year')

# Hide x tick marks on every panel; only the bottom panel keeps its labels.
for position, panel in enumerate(axarr):
    panel.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=(position == len(axarr) - 1),
    )

plt.show()

NYC Population Data

# County-level table: keep the rows for the five listed counties and add
# their 'Population' values together per year.
violent = pd.read_csv('index_violent_property_and_firearm_rates.csv')
nyc_pop = violent[violent.County.isin(['New York', 'Bronx', 'Kings', 'Richmond', 'Queens'])]
# One-column frame indexed by 'Year'.
population = nyc_pop.groupby('Year')['Population'].sum().to_frame()

# Join the two data sets by year.
# The monthly shelter counts are aggregated to a yearly mean, then the
# per-year population frame is joined on the index. The original first bound
# data2 to a slice of `data` and overwrote it on the very next line; that dead
# assignment is removed.
data2 = data.groupby('year')['total_pop'].mean().to_frame()

data2 = data2.join(population)
data2.reset_index(inplace=True)  # turn the year index back into a column
newcols = {
    'total_pop': 'homeless_pop',
    'Population': 'nyc_pop',
}
data2.rename(columns=newcols, inplace=True)

# Dummy variable 'policy': 1 for the years 2007 through 2011 (inclusive),
# 0 otherwise, stored as a categorical column.
data2['policy'] = data2['year'].between(2007, 2011).astype('int64').astype('category')
# data2['year'] = pd.to_datetime(data2['year'], format = '%Y')

# NYC population plot.
# Flatten the year index into a column, rename both columns, and parse the
# year as a timestamp so it can later share a time axis with monthly data.
population.reset_index(inplace=True)
population.columns = ['year', 'nyc_pop']
population['year'] = pd.to_datetime(population['year'], format='%Y')

# The year range in the title is read from the data. The original hard-coded
# "1990~2007", which does not match a series that the later regression uses
# for the years from 2007 onwards.
ax4 = population.plot(
    x='year',
    y='nyc_pop',
    title='Fig. 6. NYC Total Population over Time, {}~{}'.format(
        population['year'].dt.year.min(), population['year'].dt.year.max()
    ),
)
# Axis label spelling corrected ('Poeple' -> 'People').
ax4.set_ylabel('Number of People')

Text(0,0.5,'Number of Poeple')

Examining Relationship Between NYC Homeless Admissions and NYC Population

from matplotlib import gridspec

# Pearson correlation between the yearly shelter population and the NYC population.
data2.homeless_pop.corr(data2.nyc_pop, method='pearson')

0.8859154211825996

# Fig. 7: NYC population (top panel) stacked directly above the shelter
# population (bottom panel) on a shared time axis.
fig = plt.figure()
gs = gridspec.GridSpec(2, 1, height_ratios=[1, 1])
ax0 = plt.subplot(gs[0])
line0, = ax0.plot(population.year, population.nyc_pop, color='orange')

ax1 = plt.subplot(gs[1], sharex=ax0)
line1, = ax1.plot(data.month_year, data.total_pop, color='skyBlue')
# The x tick labels are shown on the bottom panel only.
plt.setp(ax0.get_xticklabels(), visible=False)
# Hide the last y tick label of the bottom panel; the panels touch
# (hspace is set to 0 below).
yticks = ax1.yaxis.get_major_ticks()
yticks[-1].label1.set_visible(False)

# A single legend on the top panel covers the lines of both panels.
ax0.legend((line0, line1), ('nyc population', 'homeless population'), loc='upper left')
ax0.set_ylabel('Number of Population')

plt.subplots_adjust(hspace=.0)

# The year range in the title is derived from the two plotted series. The
# original hard-coded "1990~2007" although the shelter data is kept through 2017.
first_year = min(population.year.min(), data.month_year.min()).year
last_year = max(population.year.max(), data.month_year.max()).year
plt.suptitle(
    'Fig. 7. NYC Population v. NYC Homeless Shelter Population over Time, {}~{}'.format(
        first_year, last_year
    )
)
plt.show()

# Fig. 8: yearly shelter population against NYC population
# (author's note: linear relationship between nyc_pop and homeless_pop).
data2.plot.scatter(
    x='nyc_pop',
    y='homeless_pop',
    title='Fig. 8. NYC Homeless Shelter Population v. NYC Population',
)

<matplotlib.axes._subplots.AxesSubplot at 0x115f5c630>

# Shelter population as a fraction of the NYC population, one value per year.
data2['proportion_homeless_total'] = data2.homeless_pop / data2.nyc_pop

# Fig. 9: the fraction over time, with the y axis fixed to 0-0.01.
ax = data2.plot.scatter(
    x='year',
    y='proportion_homeless_total',
    title='Fig. 9. Proportion of NYC Population in Homeless Shelters',
)
ax.set_ylim([0, 0.01])
ax.set_ylabel('Proportion of total population in shelters')

Text(0,0.5,'Proportion of total population in shelters')

# Mean of the yearly shelter proportion, printed as a percentage.
# Message spelling corrected ('populaiton' -> 'population').
print("Average percentage of homeless population (in shelter) over nyc population = {:.5f}%".format(data2["proportion_homeless_total"].mean()*100))

Average percentage of homeless population (in shelter) over nyc populaiton = 0.43229%

# Year-over-year percentage change of each population series:
# (current - previous) * 100 / previous. The first row has no previous year
# and stays NaN.
# Columns are addressed by name and computed in one vectorised step; the
# original index loops relied on hard-coded column positions (1, 2, 5, 6),
# which break silently if a column is added or reordered.
data2["inc_rate_homeless"] = data2["homeless_pop"].diff() * 100.0 / data2["homeless_pop"].shift()
data2["inc_rate_nyc"] = data2["nyc_pop"].diff() * 100.0 / data2["nyc_pop"].shift()

Linear Regression & Modeling Predictions

from sklearn import linear_model

# Split the yearly table at 2007, the first year flagged by the policy dummy:
# bf2007 holds the years before 2007, af2007 holds 2007 onwards.
# Splitting on the year value instead of row position 17 stays correct if a
# year is missing, and .copy() makes both frames independent of data2 so that
# adding columns to them later does not raise SettingWithCopyWarning.
bf2007 = data2[data2.year < 2007].copy()
af2007 = data2[data2.year >= 2007].copy()

Regression 1: Proportion ~ Year

# Regression 1: fit proportion ~ year on the bf2007 rows only, then predict
# for every year in data2.
reg1 = linear_model.LinearRegression().fit(
    bf2007[['year']], bf2007['proportion_homeless_total']
)
data2['prediction1'] = reg1.predict(data2[['year']]).reshape(-1, 1)

# Fig. 10: observed proportion (scatter) against the regression-1 line.
ax = data2.plot.scatter(x='year', y='proportion_homeless_total')
actual_points = ax.collections[0]  # the scatter artist just drawn on fresh axes
predicted_line, = ax.plot(data2.year, data2.prediction1, color='orange')

# Pass the handles explicitly. A bare list of labels is matched to artists in
# an order that differs between matplotlib releases, so the two entries could
# end up swapped.
ax.legend(
    [predicted_line, actual_points],
    ['predicted homeless population', 'actual homeless population'],
    loc='upper left',
)

# Dashed vertical markers at 2007 and 2011, drawn after the legend so they
# are not listed in it.
ax.axvline(x=2007, color='k', linestyle='--', label='2007')
ax.axvline(x=2011, color='k', linestyle='--', label='2011')

ax.set_ylabel('Proportion of total population in shelters')
ax.set_ylim([0, 0.01])
plt.title('Fig. 10. Predicted Proportion ')
plt.show()

This linear regression is fitted on data from 1990 to 2006. The prediction shows the expected trend in the proportion had the policy not been implemented.

# One-row table pairing the single predictor name with its fitted slope.
coefficients = pd.DataFrame(
    {
        'variable': ['year'],
        'coefficient': reg1.coef_,
    }
)
coefficients

# Score of regression 1 on the same bf2007 rows it was fitted on.
print(
    'R_sqr:',
    reg1.score(bf2007[['year']], bf2007['proportion_homeless_total']),
)

R_sqr: 0.5989843235444907

Regression 2: Homeless population ~ NYC total population

# Regression 2: fit homeless_pop ~ nyc_pop on the bf2007 rows, then predict
# the shelter population for the af2007 rows from their NYC population.
reg2 = linear_model.LinearRegression()
reg2.fit(bf2007[['nyc_pop']], bf2007.homeless_pop)
# assign() builds a new frame instead of writing a column into a slice of
# data2, which is what raised SettingWithCopyWarning in the original cell.
# The 1-D prediction array is used directly; no reshape is needed.
af2007 = af2007.assign(prediction2=reg2.predict(af2007[['nyc_pop']]))

/usr/local/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until

# Fig. 11: observed yearly shelter population with the regression-2
# prediction for the af2007 rows drawn on the same axes.
ax = data2.plot(x='year', y='homeless_pop')
plt.plot(af2007['year'], af2007['prediction2'])

# Dashed vertical markers at 2007 and 2011.
for marker_year in (2007, 2011):
    ax.axvline(x=marker_year, color='k', linestyle='--', label=str(marker_year))

# Two labels are supplied, so only the first two lines drawn get legend entries.
plt.legend(
    [
        'actual homeless population',
        'predicted homeless population if policy not implemented',
    ],
    loc='upper left',
)
ax.set_ylabel('Homeless population')
plt.title('Fig. 11. Predicted Homeless Population')
plt.show()

This linear regression is fitted on data from 1990 to 2006. The prediction shows the expected trend in the homeless population had the policy not been implemented.

	total_pop	total_fam	persons_in_fam	children	adults_in_fam	single_men	single_women
mean	34937.9	8356.7	26338.4	14126.9	12211.5	6572.3	2005.6
std	12979.9	3403.7	10800.7	5386.7	5443.2	1699.3	816.4
min	18144.0	3196.0	9993.0	5681.0	4312.0	4652.0	994.0
25%	23400.8	5492.0	17015.8	9569.8	7477.0	5399.2	1331.5
50%	33872.0	8453.0	26618.0	14160.5	12471.5	5971.0	1872.0
75%	38980.2	9864.5	30348.0	16644.5	14239.8	6992.5	2483.0