pandas-challenge: A Jupyter Notebook repository from vtyeh

Observed Trends:

Looking at all the reading scores and math scores in each table, students tend to do better in reading than in math.
Looking at the top performing schools, bottom performing schools, and passing rates by school type, students have a much higher passing rate in charter schools (~95%) than in district schools (~74%).
Looking at the summary of performance by spending ranges per student, it seems that students have a higher passing rate in the lower spending ranges (<$615).

# Import dependencies
import pandas as pd
import numpy as np

# Create paths to csv files
school_path = "raw_data/schools_complete.csv"
student_path = "raw_data/students_complete.csv"

# Read the csv files
schools = pd.read_csv(school_path)
schools = schools.rename(columns={"name": "school"}) # so we can merge later

students = pd.read_csv(student_path)
students = pd.DataFrame(students)

District Summary

# Calculate total schools, students, and budget of district
total_schools = schools['School ID'].count()
total_students = schools['size'].sum()
total_budget = schools['budget'].sum()

# Calculate average math and reading scores
avg_math = students['math_score'].mean()
avg_read = students['reading_score'].mean()

# Calculate percent passing rates
pass_math = students.loc[students['math_score'] >= 70]
per_math = len(pass_math)/ total_students * 100

pass_read = students.loc[students['reading_score'] >= 70]
per_read = len(pass_read)/ total_students * 100

overall = (per_math + per_read)/2 

# Create dataframe for district summary
district_summary = pd.DataFrame({"Total Schools": total_schools,
                                "Total Students": total_students,
                                "Total Budget": total_budget,
                                "Average Math Score": avg_math,
                                "Average Reading Score": avg_read,
                                "% Passing Math": per_math,
                                "% Passing Reading": per_read,
                                "% Overall Passing Rate": [overall]})

#Reorganize columns with double brackets
district_summary = district_summary[["Total Schools", "Total Students", "Total Budget", 
                                     "Average Math Score", "Average Reading Score", 
                                     "% Passing Math", "% Passing Reading", "% Overall Passing Rate"]]

# Format to "$0,000.00"
district_summary['Total Budget'] = district_summary['Total Budget'].map('${:,.2f}'.format)

district_summary

	Total Schools	Total Students	Total Budget	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	% Overall Passing Rate
0	15	39170	$24,649,428.00	78.985371	81.87784	74.980853	85.805463	80.393158

School Summary

# Merge schools and students
merge_df = pd.merge(schools, students, on="school")
merge_df = merge_df.drop(['School ID', 'Student ID'], axis=1)

# Set school as index, get type, total students, total budget and per student budget
school_index = schools.set_index('school')
school_type = school_index['type']
school_students = school_index['size']
school_budget = school_index['budget']
psb = school_budget/school_students

# Set school as index and groupby school for merge_df to get average scores by school
index = merge_df.set_index('school')
grouped = index.groupby(['school'])

# Get average math and reading score
school_avg_math = grouped['math_score'].mean()
school_avg_read = grouped['reading_score'].mean()

# Calculate percent that passed math
total_stu = grouped['name'].count()
grouped_math = pass_math.groupby('school')
school_pass_math = grouped_math['name'].count()/total_stu*100

# Calculate percent that passed reading
grouped_reading = pass_read.groupby('school')
school_pass_read = grouped_reading['name'].count()/total_stu*100
                             
# Calculate overall passing rate
overall_pass = (school_pass_math + school_pass_read)/2

# Create dataframe for school summary
school_summary = pd.DataFrame({"School Type": school_type,
                              "Total Students": school_students,
                              "Total School Budget": school_budget,
                              "Per Student Budget": psb,
                              "Average Math Score": school_avg_math,
                              "Average Reading Score": school_avg_read,
                              "% Passing Math": school_pass_math,
                              "% Passing Reading": school_pass_read,
                              "Overall Passing Rate": overall_pass})

# Reorganize school summary columns 
school_summary = school_summary[["School Type", "Total Students", "Total School Budget", 
                                 "Per Student Budget", "Average Math Score", "Average Reading Score", 
                                 "% Passing Math", "% Passing Reading", "Overall Passing Rate"]]

# Format to "$0,000.00"
school_summary['Total School Budget'] = school_summary['Total School Budget'].map('${:,.2f}'.format)
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].map('${:,.2f}'.format)

school_summary.head()

	School Type	Total Students	Total School Budget	Per Student Budget	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
Bailey High School	District	4976	$3,124,928.00	$628.00	77.048432	81.033963	66.680064	81.933280	74.306672
Cabrera High School	Charter	1858	$1,081,356.00	$582.00	83.061895	83.975780	94.133477	97.039828	95.586652
Figueroa High School	District	2949	$1,884,411.00	$639.00	76.711767	81.158020	65.988471	80.739234	73.363852
Ford High School	District	2739	$1,763,916.00	$644.00	77.102592	80.746258	68.309602	79.299014	73.804308
Griffin High School	Charter	1468	$917,500.00	$625.00	83.351499	83.816757	93.392371	97.138965	95.265668

Top Performing Schools (By Passing Rate)

# Create top performing schools summary by passing rate
top_summary = school_summary.loc[school_summary['Overall Passing Rate'] > 90]
top_summary.sort_values(['Overall Passing Rate'], ascending=False).head()

	School Type	Total Students	Total School Budget	Per Student Budget	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
Cabrera High School	Charter	1858	$1,081,356.00	$582.00	83.061895	83.975780	94.133477	97.039828	95.586652
Thomas High School	Charter	1635	$1,043,130.00	$638.00	83.418349	83.848930	93.272171	97.308869	95.290520
Pena High School	Charter	962	$585,858.00	$609.00	83.839917	84.044699	94.594595	95.945946	95.270270
Griffin High School	Charter	1468	$917,500.00	$625.00	83.351499	83.816757	93.392371	97.138965	95.265668
Wilson High School	Charter	2283	$1,319,574.00	$578.00	83.274201	83.989488	93.867718	96.539641	95.203679

Bottom Performing Schools (By Passing Rate)

# Create bottom performing schools summary by passing rate
bottom_summary = school_summary.loc[school_summary['Overall Passing Rate'] < 75]
bottom_summary.sort_values(['Overall Passing Rate'], ascending=True).head()

	School Type	Total Students	Total School Budget	Per Student Budget	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
Rodriguez High School	District	3999	$2,547,363.00	$637.00	76.842711	80.744686	66.366592	80.220055	73.293323
Figueroa High School	District	2949	$1,884,411.00	$639.00	76.711767	81.158020	65.988471	80.739234	73.363852
Huang High School	District	2917	$1,910,635.00	$655.00	76.629414	81.182722	65.683922	81.316421	73.500171
Johnson High School	District	4761	$3,094,650.00	$650.00	77.072464	80.966394	66.057551	81.222432	73.639992
Ford High School	District	2739	$1,763,916.00	$644.00	77.102592	80.746258	68.309602	79.299014	73.804308

Math Scores by Grade

# Math scores by grade
ninth = students.loc[students['grade']=='9th'].groupby("school")
ninth_math = ninth['math_score'].mean()
tenth = students.loc[students['grade']=='10th'].groupby("school")
tenth_math = tenth['math_score'].mean()
eleventh = students.loc[students['grade']=='11th'].groupby("school")
eleventh_math = eleventh['math_score'].mean()
twelfth = students.loc[students['grade']=='12th'].groupby("school")
twelfth_math = twelfth['math_score'].mean()

# Create dataframe for math scores summary
math_summary = pd.DataFrame({"9th": ninth_math,
                            "10th": tenth_math,
                            "11th": eleventh_math,
                            "12th": twelfth_math})
math_summary = math_summary[["9th","10th","11th","12th"]]
del math_summary.index.name

math_summary.head()

	9th	10th	11th	12th
Bailey High School	77.083676	76.996772	77.515588	76.492218
Cabrera High School	83.094697	83.154506	82.765560	83.277487
Figueroa High School	76.403037	76.539974	76.884344	77.151369
Ford High School	77.361345	77.672316	76.918058	76.179963
Griffin High School	82.044010	84.229064	83.842105	83.356164

Reading Scores by Grade

# Reading scores by grade
r_ninth = students.loc[students['grade'] == '9th'].groupby("school")
ninth_read = r_ninth['reading_score'].mean()
r_tenth = students.loc[students['grade'] == '10th'].groupby("school")
tenth_read = r_tenth['reading_score'].mean()
r_eleventh = students.loc[students['grade'] == '11th'].groupby("school")
eleventh_read = r_eleventh['reading_score'].mean()
r_twelfth = students.loc[students['grade'] == '12th'].groupby("school")
twelfth_read = r_twelfth['reading_score'].mean()

# Create dataframe for reading scores summary
read_summary = pd.DataFrame({"9th": ninth_read,
                            "10th": tenth_read,
                            "11th": eleventh_read,
                            "12th": twelfth_read})
read_summary = read_summary[["9th","10th","11th","12th"]]
del read_summary.index.name

read_summary.head()

	9th	10th	11th	12th
Bailey High School	81.303155	80.907183	80.945643	80.912451
Cabrera High School	83.676136	84.253219	83.788382	84.287958
Figueroa High School	81.198598	81.408912	80.640339	81.384863
Ford High School	80.632653	81.262712	80.403642	80.662338
Griffin High School	83.369193	83.706897	84.288089	84.013699

Scores by School Spending

# Create bins
bins = [0,585,615,645,675]

# Create names for the bins
spending_range = ['<$585','$585-615','$615-645','$645-675']

# Change formatting of per student budget in school_summary from string back to float so it can be binned 
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].str.replace('$', '')
school_summary['Per Student Budget'] = school_summary['Per Student Budget'].astype(float)
                                                      
school_summary["Spending Ranges (Per Student)"] = pd.cut(school_summary["Per Student Budget"], 
                                                         bins, labels=spending_range)
spend_summary = school_summary.groupby("Spending Ranges (Per Student)")
spend_summary = spend_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                               "% Passing Reading", "Overall Passing Rate"]]
spend_summary.mean()

	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
Spending Ranges (Per Student)
<$585	83.455399	83.933814	93.460096	96.610877	95.035486
$585-615	83.599686	83.885211	94.230858	95.900287	95.065572
$615-645	79.079225	81.891436	75.668212	86.106569	80.887391
$645-675	76.997210	81.027843	66.164813	81.133951	73.649382

Scores by School Size

# Create bins
bins2 = [0, 1000, 2000, 5000]

# Create names for bins
size_range = ['Small', 'Medium', 'Large']

school_summary["School Size"] = pd.cut(school_summary["Total Students"], 
                                                         bins2, labels=size_range)
size_summary = school_summary.groupby("School Size")
size_summary = size_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                             "% Passing Reading", "Overall Passing Rate"]]
size_summary.mean()

</style>

	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
School Size
Small	83.821598	83.929843	93.550225	96.099437	94.824831
Medium	83.374684	83.864438	93.599695	96.790680	95.195187
Large	77.746417	81.344493	69.963361	82.766634	76.364998

Scores by School Type

type_summary = school_summary.groupby("School Type")
type_summary = type_summary[["Average Math Score", "Average Reading Score", "% Passing Math", 
                             "% Passing Reading", "Overall Passing Rate"]]
type_summary.mean()

	Average Math Score	Average Reading Score	% Passing Math	% Passing Reading	Overall Passing Rate
School Type
Charter	83.473852	83.896421	93.620830	96.586489	95.103660
District	76.956733	80.966636	66.548453	80.799062	73.673757

vtyeh/pandas-challenge

Observed Trends:

District Summary

School Summary

Top Performing Schools (By Passing Rate)

Bottom Performing Schools (By Passing Rate)

Math Scores by Grade

Reading Scores by Grade

Scores by School Spending

Scores by School Size

Scores by School Type