Examining the Impact of the New York Advantage Program on the NYC Homeless Population

HPMN P8596 Final Project

Homeless Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
In [2]:
data = pd.read_csv('NYCHomeless.csv')

data.rename(columns = {'Unnamed: 0':'month_year'}, inplace = True)
data['month_year'] = data.month_year.str.replace('SEPT', 'SEP')

data['str_split'] = data.month_year.str.split(' ')
data['month'] = data.str_split.str.get(0)
#data['year'] = data.str_split.str.get(1)

from datetime import datetime
data['month_year'] = pd.to_datetime(data['month_year'], format = '%b %Y')
data['year'] = data['month_year'].dt.year

data = data[(data.year >= 1990) & (data.year <= 2017)]
del data['str_split']

Descriptive Statistics

In [3]:
# Descriptive statistics
data.describe().iloc[1:7, 0:7].round(1)
Out[3]:
total_pop total_fam persons_in_fam children adults_in_fam single_men single_women
mean 34937.9 8356.7 26338.4 14126.9 12211.5 6572.3 2005.6
std 12979.9 3403.7 10800.7 5386.7 5443.2 1699.3 816.4
min 18144.0 3196.0 9993.0 5681.0 4312.0 4652.0 994.0
25% 23400.8 5492.0 17015.8 9569.8 7477.0 5399.2 1331.5
50% 33872.0 8453.0 26618.0 14160.5 12471.5 5971.0 1872.0
75% 38980.2 9864.5 30348.0 16644.5 14239.8 6992.5 2483.0

Data Exploration

Total Homeless Population in DOH Shelters

In [4]:
ax1 = data.plot(x  = 'month_year', y = 'total_pop', rot = 70, title = 'Fig. 1. NYC Homeless Population in shelters')
ax1.set_ylabel('Number of Poeple')
Out[4]:
Text(0,0.5,'Number of Poeple')

NYC Homeless Shelter Admissions by Group, 1990~2017

In [5]:
plt.plot(data.month_year, data.children)
plt.plot(data.month_year, data.single_men)
plt.plot(data.month_year, data.single_women)
plt.plot(data.month_year, data.persons_in_fam)
plt.legend(['Children', 'Single men', 'Single women', 'Family'], loc='upper left')
plt.title('Fig. 2. Homeless Population in DOH Shelters, Stratified by Status')
plt.ylabel('Number of Poeple')
plt.show()
In [6]:
summary = data.groupby('year')['children', 'single_men', 'single_women', 'persons_in_fam'].mean()

fig=plt.figure()
ax2=fig.add_subplot(1,3,1)
ax3=fig.add_subplot(1,3,3)

summary.plot.box(ax = ax2, rot = 70, title = "Fig. 3. Homeless Population Boxplot by Group, 1990~2017")
ax2.set_ylabel('Number of Poeple')
# summary.loc[2000].plot.pie(ax = ax1, figsize=(8, 4), autopct='%1.1f%%')
summary.loc[2015].plot.pie(ax = ax3, figsize=(13, 4), autopct='%1.1f%%', title = 'Fig. 4. Homeless Population Pie Chart by Group, 2015')
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x115a6a588>

Seasonal Changes

In [7]:
f, axarr = plt.subplots(5, sharex=True)

axarr[0].plot(data[data.year == 1995].month, data[data.year == 1995].total_pop)
axarr[0].set_title('1995')

axarr[1].plot(data[data.year == 2000].month, data[data.year == 2000].total_pop)
axarr[1].set_title('2000')

axarr[2].plot(data[data.year == 2005].month, data[data.year == 2005].total_pop)
axarr[2].set_title('2005')

axarr[3].plot(data[data.year == 2010].month, data[data.year == 2010].total_pop)
axarr[3].set_title('2010')

axarr[4].plot(data[data.year == 2015].month, data[data.year == 2015].total_pop)
axarr[4].set_title('2015')

f.subplots_adjust(hspace=0.6)
axarr[2].set_ylabel('Number of People')

plt.suptitle('Fig. 5. Homeless Population in shelters, Seasonal Changes by Year')

axarr[0].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
axarr[1].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
axarr[2].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
axarr[3].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
axarr[4].tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=True)

plt.show()

NYC Population Data

In [8]:
violent = pd.read_csv('index_violent_property_and_firearm_rates.csv')
nyc_pop = violent[(violent.County == 'New York') |
                 (violent.County == 'Bronx') |
                 (violent.County == 'Kings') |
                 (violent.County == 'Richmond') |
                 (violent.County == 'Queens')]
population = nyc_pop.groupby('Year')['Population'].sum()
population = population.to_frame()
In [9]:
# Join two data
data2 = data[:]
data2 = data.groupby('year')['total_pop'].mean() #aggregate montly data to yearly, using mean pop from homeless data
data2 = data2.to_frame()

data2 = data2.join(population)
data2.reset_index(inplace=True)
newcols = {
    'total_pop':'homeless_pop', 
    'Population':'nyc_pop'
}
data2.rename(columns=newcols, inplace=True)
In [10]:
# creating a dummy variable 'policy'
data2['policy'] = 0
data2.loc[(data2.year >= 2007) & (data2.year <= 2011), 'policy'] = 1
data2['policy'] = data2.policy.astype('category')
# data2['year'] = pd.to_datetime(data2['year'], format = '%Y')
In [11]:
# NYC population plot
population.reset_index(inplace=True)
population.columns = ['year', 'nyc_pop']
population['year'] = pd.to_datetime(population['year'], format = '%Y')
In [12]:
ax4 = population.plot(x='year', y='nyc_pop', title = 'Fig. 6. NYC Total Population over Time, 1990~2007')
ax4.set_ylabel('Number of Poeple')
Out[12]:
Text(0,0.5,'Number of Poeple')

Examining Relationship Between NYC Homeless Admissions and NYC Population

In [13]:
from matplotlib import gridspec
In [14]:
data2['homeless_pop'].corr(data2['nyc_pop'])
Out[14]:
0.8859154211825996
In [15]:
fig = plt.figure()
gs = gridspec.GridSpec(2, 1, height_ratios = [1,1])
ax0 = plt.subplot(gs[0])
line0, = ax0.plot(population.year, population.nyc_pop, color = 'orange')

ax1 = plt.subplot(gs[1], sharex = ax0)
line1, = ax1.plot(data.month_year, data.total_pop, color = 'skyBlue')
plt.setp(ax0.get_xticklabels(), visible=False)
# remove last tick label for the second subplot
yticks = ax1.yaxis.get_major_ticks()
yticks[-1].label1.set_visible(False)


ax0.legend((line0, line1), ('nyc population', 'homeless population'), loc='upper left')
ax0.set_ylabel('Number of Population')

plt.subplots_adjust(hspace=.0)

plt.suptitle('Fig. 7. NYC Population v. NYC Homeless Shelter Population over Time, 1990~2007')
plt.show()
In [16]:
data2.plot.scatter(x = 'nyc_pop', y = 'homeless_pop', title = 'Fig. 8. NYC Homeless Shelter Population v. NYC Population') #linear relationship bw nyc_pop and homeless_pop
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x115f5c630>
In [17]:
data2['proportion_homeless_total'] = data2['homeless_pop']/data2['nyc_pop']
ax = data2.plot.scatter(x = 'year', y = 'proportion_homeless_total', title = 'Fig. 9. Proportion of NYC Population in Homeless Shelters')
ax.set_ylim([0,0.01])
ax.set_ylabel('Proportion of total population in shelters')
Out[17]:
Text(0,0.5,'Proportion of total population in shelters')
In [18]:
print("Average percentage of homeless population (in shelter) over nyc populaiton = {:.5f}%".format(data2["proportion_homeless_total"].mean()*100))
Average percentage of homeless population (in shelter) over nyc populaiton = 0.43229%
In [19]:
data2["inc_rate_homeless"] = np.nan
data2["inc_rate_nyc"] = np.nan

for i in range(1, len(data2)):
    data2.iloc[i, 5] = ((data2.iloc[i, 1] - data2.iloc[i-1, 1]) * 100.0 / data2.iloc[i-1, 1])
    
for i in range(1, len(data2)):
    data2.iloc[i, 6] = ((data2.iloc[i, 2] - data2.iloc[i-1, 2]) * 100.0 / data2.iloc[i-1, 2])

Linear Regression & Modeling Predictions

In [20]:
from sklearn import linear_model
In [21]:
bf2007 = data2.iloc[0:17]
af2007 = data2.iloc[17:]

Regression 1: Proportion ~ Year

In [22]:
reg1 = linear_model.LinearRegression()
reg1.fit(bf2007[['year']], bf2007.proportion_homeless_total)
data2['prediction1'] = reg1.predict(data2[['year']]).reshape(-1,1)

ax = data2.plot.scatter(x = 'year', y = 'proportion_homeless_total')
plt.plot(data2.year, data2.prediction1, color = 'orange')
plt.legend(['predicted homeless population', 'actual homeless population'], loc='upper left')
ax.axvline(x=2007, color='k', linestyle='--', label='2007')
ax.axvline(x=2011, color='k', linestyle='--', label='2011')

ax.set_ylabel('Proportion of total population in shelters')
ax.set_ylim([0,0.01])
plt.title('Fig. 10. Predicted Proportion ')
plt.show()

This linear regression is fitted using data from 1990 to 2006. The predicion shows the trend in proportion without the policy implemented.

In [24]:
coefficients=pd.DataFrame({'variable':['year'],'coefficient':reg1.coef_})
coefficients
Out[24]:
variable coefficient
0 year 0.000096
In [25]:
print('R_sqr:',reg1.score(bf2007[['year']], bf2007.proportion_homeless_total))
R_sqr: 0.5989843235444907

Regression 2: Homeless population ~ NYC total population

In [26]:
reg2 = linear_model.LinearRegression()
reg2.fit(bf2007[['nyc_pop']], bf2007.homeless_pop)
af2007['prediction2'] = reg2.predict(af2007[['nyc_pop']]).reshape(-1,1)
/usr/local/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
In [27]:
ax = data2.plot(x = 'year', y = 'homeless_pop')
plt.plot(af2007.year, af2007.prediction2)
ax.axvline(x=2007, color='k', linestyle='--', label='2007')
ax.axvline(x=2011, color='k', linestyle='--', label='2011')
plt.legend(['actual homeless population', 'predicted homeless population if policy not implemented'], loc='upper left')
ax.set_ylabel('Homeless population')
plt.title('Fig. 11. Predicted Homeless Population')
plt.show()

This linear regression is fitted using data from 1990 to 2006. The predicion shows the trend in homeless population without the policy implemented.