/capstone2019

Primary LanguageJupyter Notebook

import os
import pandas as pd
import mapreduce as mr
import csv
!ls ./trips
ltrips_short_exp123_wkdy_final_noXYcoord.txt
ltrips_short_exp13_sat_final_noXYcoord.txt
ltrips_short_exp13_sun_final_noXYcoord.txt

Trip data

relevant page

image.png

Since this data is based on survey, MTA applied and validated weight. The description can be found in the report (pg. 46)
But not fully understood! I just tried this.
Report

week = pd.read_csv('./trips/ltrips_short_exp123_wkdy_final_noXYcoord.txt', dtype=str)
print('The number of total respondents: {}'.format(len(week)))
The number of total respondents: 49167
[(i,j) for i,j in zip(week.columns, range(len(week.columns)))]
[('sampn', 0),
 ('perno', 1),
 ('tripno', 2),
 ('TRIP_ID', 3),
 ('otype', 4),
 ('dtype', 5),
 ('tpur', 6),
 ('dtime', 7),
 ('atime', 8),
 ('trip_sdate', 9),
 ('trip_edate', 10),
 ('O_COUNTY_STR', 11),
 ('D_COUNTY_STR', 12),
 ('Ptype', 13),
 ('daywk', 14),
 ('trans', 15),
 ('travy', 16),
 ('ntrav', 17),
 ('o_ntrav', 18),
 ('trtot', 19),
 ('ussob', 20),
 ('new_fare', 21),
 ('metro', 22),
 ('cash', 23),
 ('tmet', 24),
 ('o_tmet', 25),
 ('tumet', 26),
 ('o_tumet', 27),
 ('ppmet', 28),
 ('fare', 29),
 ('esub1', 30),
 ('esub2', 31),
 ('esub3', 32),
 ('esub4', 33),
 ('esub5', 34),
 ('esub6', 35),
 ('esub7', 36),
 ('esub8', 37),
 ('trip_count', 38),
 ('rectype', 39),
 ('s_dat', 40),
 ('s_dur', 41),
 ('ctfip', 42),
 ('incen', 43),
 ('stype', 44),
 ('lang', 45),
 ('hhnum', 46),
 ('hhveh', 47),
 ('ltele', 48),
 ('ctele', 49),
 ('cctel', 50),
 ('hinet', 51),
 ('hmrac', 52),
 ('o_hmrac', 53),
 ('ccmin', 54),
 ('ccinc', 55),
 ('cclin', 56),
 ('hcity', 57),
 ('hstat', 58),
 ('hzip', 59),
 ('hcnty', 60),
 ('puma', 61),
 ('MODE_G2', 62),
 ('MODE_G3', 63),
 ('MODE_G5', 64),
 ('MODE_G6', 65),
 ('MODE_G8', 66),
 ('MODE_G9', 67),
 ('MODE_G10', 68),
 ('PER_DAYTYPE', 69),
 ('TT_DAYTYPE', 70),
 ('MODE1', 71),
 ('MODE2', 72),
 ('MODE3', 73),
 ('MODE4', 74),
 ('MODE5', 75),
 ('MODE6', 76),
 ('MODE7', 77),
 ('MODE8', 78),
 ('MODE9', 79),
 ('MODE10', 80),
 ('MODE11', 81),
 ('MODE12', 82),
 ('MODE13', 83),
 ('MODE14', 84),
 ('MODE15', 85),
 ('MODE16', 86),
 ('O_Boro', 87),
 ('D_Boro', 88),
 ('HH_adults', 89),
 ('HH_workers', 90),
 ('R_BORO', 91),
 ('HH_SIZE', 92),
 ('HH_INC4', 93),
 ('HH_INC7', 94),
 ('HH_ETHNIC', 95),
 ('O_PURP', 96),
 ('O_NYTTAZ', 97),
 ('O_RTFMTAZ', 98),
 ('O_TRACT', 99),
 ('O_PUMA', 100),
 ('O_COUNTY', 101),
 ('D_PURP', 102),
 ('D_NYTTAZ', 103),
 ('D_RTFMTAZ', 104),
 ('D_TRACT', 105),
 ('D_PUMA', 106),
 ('D_COUNTY', 107),
 ('O_PURP_M', 108),
 ('D_PURP_M', 109),
 ('NSUB', 110),
 ('NNYBUS', 111),
 ('NCRAIL', 112),
 ('NOTHTRAN', 113),
 ('StopAreaNo', 114),
 ('HR_DEP', 115),
 ('HR_ARR', 116),
 ('TRIP_MIN', 117),
 ('EXP1_FINAL_WKD', 118),
 ('EXP21_FINAL_WKD', 119),
 ('EXP22_FINAL_WKD', 120),
 ('EXP31_FINAL_WKD', 121),
 ('EXP32_FINAL_WKD', 122),
 ('hmsex', 123),
 ('hmage', 124),
 ('SIRTOA_FLAG', 125),
 ('O_PLANDIST', 126),
 ('D_PLANDIST', 127)]
sum(week['EXP32_FINAL_WKD'].astype(float).dropna())
14308914.969999703
week[['O_TRACT','D_TRACT']][week['O_TRACT'] == week['D_TRACT']]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
O_TRACT D_TRACT
13 36005017300.00 36005017300.00
34 36005017300.00 36005017300.00
38 36005005902.00 36005005902.00
47 36005005901.00 36005005901.00
48 36005005901.00 36005005901.00
49 36005005901.00 36005005901.00
54 36005005901.00 36005005901.00
62 36005006500.00 36005006500.00
63 36005006500.00 36005006500.00
64 36005006500.00 36005006500.00
65 36005006500.00 36005006500.00
67 36061016400.00 36061016400.00
68 36061016400.00 36061016400.00
73 36061016400.00 36061016400.00
74 36061016400.00 36061016400.00
86 36005004700.00 36005004700.00
99 36005004700.00 36005004700.00
100 36005004700.00 36005004700.00
116 36005019900.00 36005019900.00
117 36005019900.00 36005019900.00
158 36005006500.00 36005006500.00
180 36005025500.00 36005025500.00
192 36005023702.00 36005023702.00
193 36005023702.00 36005023702.00
194 36005023702.00 36005023702.00
195 36005023702.00 36005023702.00
209 36061030300.00 36061030300.00
210 36061030300.00 36061030300.00
234 36005005302.00 36005005302.00
235 36005005302.00 36005005302.00
... ... ...
48349 36047054700.00 36047054700.00
48350 36047054700.00 36047054700.00
48353 36047054700.00 36047054700.00
48365 36047052900.00 36047052900.00
48366 36047052900.00 36047052900.00
48382 36047055100.00 36047055100.00
48401 36047089400.00 36047089400.00
48402 36047089400.00 36047089400.00
48411 36047114201.00 36047114201.00
48492 36061000700.00 36061000700.00
48493 36061000700.00 36061000700.00
48528 36047042900.00 36047042900.00
48529 36047042900.00 36047042900.00
48539 36047042900.00 36047042900.00
48540 36047042900.00 36047042900.00
48615 34037373900.00 34037373900.00
48618 36061014200.00 36061014200.00
48619 36061014200.00 36061014200.00
48759 36061027100.00 36061027100.00
48796 36061024500.00 36061024500.00
48797 36061024500.00 36061024500.00
48803 36061027100.00 36061027100.00
48836 36061009800.00 36061009800.00
48837 36061009800.00 36061009800.00
48861 36061025100.00 36061025100.00
49013 36061000800.00 36061000800.00
49014 36061000800.00 36061000800.00
49140 36061026900.00 36061026900.00
49141 36061026900.00 36061026900.00
49149 36061027900.00 36061027900.00

3205 rows × 2 columns

3205 trips has same origin and destination which is 6% of the total respondents.

week_dif_OD = week[week['O_TRACT'] != week['D_TRACT']]
len(week_dif_OD)
45962
# With OD same rows

reader = enumerate([(tract[:-3],1) for tract in week['O_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
o_tracts = list(mr.run(reader,mapper,reducer))

reader = enumerate([(tract[:-3],1) for tract in week['D_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
d_tracts = list(mr.run(reader,mapper,reducer))
# Without OD same rows

reader = enumerate([(tract[:-3],1) for tract in week_dif_OD['O_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
o_tracts = list(mr.run(reader,mapper,reducer))

reader = enumerate([(tract[:-3],1) for tract in week_dif_OD['D_TRACT'].astype(str)])
def mapper(_,o):
    yield(o)
def reducer(o,count):
    yield(o,sum(count))
d_tracts = list(mr.run(reader,mapper,reducer))
# with open('eggs.csv', 'w', newline='') as csvfile:
#     spamwriter = csv.writer(csvfile, delimiter=' ',
#                             quotechar='|', quoting=csv.QUOTE_MINIMAL)
#     for i in o_tracts:
#         spamwriter.writerow(i)

# with open('spam.csv', 'w', newline='') as csvfile:
#     spamwriter = csv.writer(csvfile, delimiter=' ',
#                             quotechar='|', quoting=csv.QUOTE_MINIMAL)
#     for i in d_tracts:
#         spamwriter.writerow(i)
trip_o = pd.DataFrame(o_tracts, columns=['tracts','trip_o'])
trip_d = pd.DataFrame(d_tracts, columns=['tracts','trip_d'])
display(trip_o.head(), trip_d.head())
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
tracts trip_o
0 11001005900 2
1 11001008600 1
2 19061000100 1
3 25019950100 1
4 25019950400 1
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
tracts trip_d
0 1
1 11001008600 1
2 25019950400 1
3 25027759100 4
4 34001002400 5

Census Tracts

Census Tract Code Format

image.png

  • census tract code
  • state + county + tract Counties in NYC
    county code in NYC
    State Code of NYC : 36
  • Manhattan - New York County (061)
  • Bronx - Bronx County (005)
  • Brooklyn - Kings County (047)
  • Queens - Queens County (081)
  • Staten Island - Richmond County (085)
ct2000 = pd.read_csv('nyct2000wi.csv', dtype=str)
ct2000.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
BoroCode BoroName CTLabel the_geom CT2000 BoroCT2000 CDEligibil NTACode NTANAme PUMA Shape_Leng Shape_Area
0 1 Manhattan 98 MULTIPOLYGON (((-73.96432543478758 40.75638153... 009800 1009800 I MN19 Turtle Bay-East Midtown 3808 5534.20005976 1906016.374
1 1 Manhattan 100 MULTIPOLYGON (((-73.96802436915851 40.75957814... 010000 1010000 I MN19 Turtle Bay-East Midtown 3808 5692.16866575 1860938.35958
2 1 Manhattan 190 MULTIPOLYGON (((-73.94505127984516 40.80259859... 019000 1019000 E MN11 Central Harlem South 3803 4231.8265884 1117371.65978
3 1 Manhattan 206 MULTIPOLYGON (((-73.93580780201182 40.80949763... 020600 1020600 E MN03 Central Harlem North-Polo Grounds 3803 5176.87315098 1602693.82096
4 1 Manhattan 217.02 MULTIPOLYGON (((-73.9470345118215 40.815829904... 021702 1021702 E MN03 Central Harlem North-Polo Grounds 3803 3338.290909 446993.881638
ct2000.BoroName.unique()
array(['Manhattan', 'Bronx', 'Queens', 'Brooklyn', 'Staten Island'],
      dtype=object)
ct2000['cdct'][ct2000['BoroName']=='Manhattan'] = '36061' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Bronx'] = '36005' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Brooklyn'] = '36047' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Queens'] = '36081' + ct2000['CT2000']
ct2000['cdct'][ct2000['BoroName']=='Staten Island'] = '36085' + ct2000['CT2000']
ct2000[:2]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
BoroCode BoroName CTLabel the_geom CT2000 BoroCT2000 CDEligibil NTACode NTANAme PUMA Shape_Leng Shape_Area cdct
0 1 Manhattan 98 MULTIPOLYGON (((-73.96432543478758 40.75638153... 009800 1009800 I MN19 Turtle Bay-East Midtown 3808 5534.20005976 1906016.374 36061009800
1 1 Manhattan 100 MULTIPOLYGON (((-73.96802436915851 40.75957814... 010000 1010000 I MN19 Turtle Bay-East Midtown 3808 5692.16866575 1860938.35958 36061010000
print(ct2000.cdct.dtype, trip_o.tracts.dtype, trip_d.tracts.dtype )
print(trip_o.trip_o.dtype, trip_d.trip_d.dtype)
object object object
int64 int64
mrg = pd.merge(ct2000, trip_o, how='inner', left_on='cdct', right_on='tracts')
mrg = pd.merge(mrg, trip_d, how='inner', left_on='cdct', right_on='tracts')
print(len(mrg))
display(mrg.head())
2120
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
BoroCode BoroName CTLabel the_geom CT2000 BoroCT2000 CDEligibil NTACode NTANAme PUMA Shape_Leng Shape_Area cdct tracts_x trip_o tracts_y trip_d
0 1 Manhattan 98 MULTIPOLYGON (((-73.96432543478758 40.75638153... 009800 1009800 I MN19 Turtle Bay-East Midtown 3808 5534.20005976 1906016.374 36061009800 36061009800 72 36061009800 74
1 1 Manhattan 100 MULTIPOLYGON (((-73.96802436915851 40.75957814... 010000 1010000 I MN19 Turtle Bay-East Midtown 3808 5692.16866575 1860938.35958 36061010000 36061010000 145 36061010000 140
2 1 Manhattan 190 MULTIPOLYGON (((-73.94505127984516 40.80259859... 019000 1019000 E MN11 Central Harlem South 3803 4231.8265884 1117371.65978 36061019000 36061019000 36 36061019000 33
3 1 Manhattan 206 MULTIPOLYGON (((-73.93580780201182 40.80949763... 020600 1020600 E MN03 Central Harlem North-Polo Grounds 3803 5176.87315098 1602693.82096 36061020600 36061020600 9 36061020600 9
4 1 Manhattan 217.02 MULTIPOLYGON (((-73.9470345118215 40.815829904... 021702 1021702 E MN03 Central Harlem North-Polo Grounds 3803 3338.290909 446993.881638 36061021702 36061021702 13 36061021702 13
mrg.to_csv('od_trip_ct2008.csv')

aggregating with population data

!wget -O pop_ct.csv https://data.cityofnewyork.us/api/views/kc6e-jm93/rows.csv?accessType=DOWNLOAD
--2019-02-27 08:48:28--  https://data.cityofnewyork.us/api/views/kc6e-jm93/rows.csv?accessType=DOWNLOAD
Resolving data.cityofnewyork.us (data.cityofnewyork.us)... 52.206.68.26, 52.206.140.199, 52.206.140.205
Connecting to data.cityofnewyork.us (data.cityofnewyork.us)|52.206.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘pop_ct.csv’

pop_ct.csv              [ <=>                ] 132.31K  --.-KB/s    in 0.002s  

Last-modified header invalid -- time-stamp ignored.
2019-02-27 08:48:29 (57.7 MB/s) - ‘pop_ct.csv’ saved [135483]
pop2000 = pd.read_csv('pop_ct.csv', dtype=str)
pop2000 = pop2000[pop2000['Year'] == '2000']
pop2000.Year.unique()
array(['2000'], dtype=object)
pop2000.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Borough Year FIPS County Code DCP Borough Code Census Tract Population
0 Bronx 2000 005 2 000100 12780
1 Bronx 2000 005 2 000200 3545
2 Bronx 2000 005 2 000400 3314
3 Bronx 2000 005 2 001600 5237
4 Bronx 2000 005 2 001900 1584
pop2000['cdct'] = '36' + pop2000['FIPS County Code'] + pop2000['Census Tract']
pop2000.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Borough Year FIPS County Code DCP Borough Code Census Tract Population cdct
0 Bronx 2000 005 2 000100 12780 36005000100
1 Bronx 2000 005 2 000200 3545 36005000200
2 Bronx 2000 005 2 000400 3314 36005000400
3 Bronx 2000 005 2 001600 5237 36005001600
4 Bronx 2000 005 2 001900 1584 36005001900
mrg = pd.merge(mrg, pop2000, how='inner', on='cdct')
mrg.columns
Index(['BoroCode', 'BoroName', 'CTLabel', 'the_geom', 'CT2000', 'BoroCT2000',
       'CDEligibil', 'NTACode', 'NTANAme', 'PUMA', 'Shape_Leng', 'Shape_Area',
       'cdct', 'tracts_x', 'trip_o', 'tracts_y', 'trip_d', 'Borough', 'Year',
       'FIPS County Code', 'DCP Borough Code', 'Census Tract', 'Population'],
      dtype='object')
mrg = mrg[['BoroName', 'the_geom', 'cdct', 'trip_o', 'trip_d', 'Year', 'Population']]
mrg.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
BoroName the_geom cdct trip_o trip_d Year Population
0 Manhattan MULTIPOLYGON (((-73.96432543478758 40.75638153... 36061009800 72 74 2000 7066
1 Manhattan MULTIPOLYGON (((-73.96802436915851 40.75957814... 36061010000 145 140 2000 1822
2 Manhattan MULTIPOLYGON (((-73.94505127984516 40.80259859... 36061019000 36 33 2000 1818
3 Manhattan MULTIPOLYGON (((-73.93580780201182 40.80949763... 36061020600 9 9 2000 2310
4 Manhattan MULTIPOLYGON (((-73.93669078722161 40.83719324... 36061024900 6 6 2000 1150
mrg['trip_o'] - mrg['trip_d']
0       -2
1        5
2        3
3        0
4        0
5        6
6       -6
7        0
8        1
9        3
10       1
11       3
12       2
13     -11
14       2
15      -1
16      -6
17       1
18       1
19      -1
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
1821     0
1822    -2
1823     1
1824    -1
1825     0
1826     0
1827    -7
1828     0
1829     1
1830     0
1831     0
1832     0
1833     0
1834    -5
1835    -1
1836     0
1837     0
1838     6
1839    -1
1840    -1
1841     1
1842     0
1843     0
1844     1
1845    -1
1846     0
1847     0
1848     0
1849     0
1850     0
Length: 1851, dtype: int64
mrg.to_csv('trip_with_pop.csv', index=False)

image.png

Figure 0. Origin Data

image.png

Figure 0. Destination data (Quantile)