#Read in the original data. This is a slightly different set than # the one we saw in class, and is formatted nicely. # You can also find other seasons at: # http://www.phys.utk.edu/sorensen/cfr/cfr/Output/2008/CF_2008_Main.html # You might need to count the column widths to make sure everything # is read in correctly though. # footfull<-read.fwf("http://www.phys.utk.edu/sorensen/cfr/cfr/Output/2008/CF_2008_Ascii_Games.txt", widths=c(5,3,3,2,5,5,4,5,30,1,30)) footfull<-footfull[(footfull[,7]>=0),-10] #Get rid of the odd group of schools that only # play each other and nobody else # badschool<-c("Trinity College CT ", "Amherst College ", "Williams College ", "Middlebury College ", "Tufts University ", "Bowdoin College ", "Colby College ", "Hamilton College ", "Bates College ", "Wesleyan University ") bad<-rep(0,dim(footfull)[1]) for (i in 1:length(badschool)){ for (j in 1:(dim(footfull)[1])){ if (as.character(footfull[j,9])==badschool[i]){bad[j]<-1} if (as.character(footfull[j,10])==badschool[i]){bad[j]<-1} }} foot<-footfull[bad==0,] #Set up the X matrix and Y vector # hometeams<-as.character(foot[,10]) awayteams<-as.character(foot[,9]) footx<-c(hometeams,awayteams) teams<-dimnames(table(footx))$footx nt<-length(teams) scnum<-(1:nt)[teams=="South Carolina "] ng<-nrow(foot) X<-matrix(0,ncol=nt+1,nrow=ng) X[,(nt+1)]<-2-foot[,4] Y<-as.numeric(foot[,8])-as.numeric(foot[,7]) colnames(X)<-c(teams,"H") for (i in 1:ng){ X[i,hometeams[i]]<-1 X[i,awayteams[i]]<--1 X[i,nt+1]<-1 } #Take out South Carolina so that it serves as the baseline (0 rating) # X<-X[,-scnum] #Do the regression # output<-lm(Y~0+X) #The homefield advantage # output$coefficients[nt] #The standard error # summary(output)$sigma #The r-squared # summary(output)$r.squared #This will give you the top 50 teams with SC put back in # ratings<-as.matrix(c(output$coefficients[1:(nt-2)],0),ncol=1) rownames(ratings)<-c(colnames(X)[1:(nt-2)],"South Carolina ") cbind(1:50,ratings[order(ratings,decreasing=TRUE)[1:50],]) #How does the model look? #The actual data plotted against the estimated values # plot(output$fitted.values,Y) abline(lm(Y~output$fitted.values)) #The residual versus predicted plot # plot(output$fitted.values,output$residuals) lines(c(-100,100),c(0,0)) #The qq-plot of the residuals # qqnorm(output$residual) qqline(output$residual) #Checking it team by team # whichteam="South Carolina " results<-cbind(foot[,c(10,8,9,7)],foot[,8]-foot[,7],output$fitted.values,output$residual) colnames(results)<-c("HTeam","HScore","ATeam","AScore","(H-A)", "Pred (H-A)","Resid") SCres<-results[((results[,1]==whichteam)| (results[,3]==whichteam)),] SCres[,1]<-abbreviate(SCres[,1],12) SCres[,3]<-abbreviate(SCres[,3],12) SCres #Mean and SD of the residuals # mean(SCres[,7]) sd(SCres[,7]) #Make adjusted residuals so that # a negative would mean that the "whichteam" team is over-rated # a positive would mean that the "whichteam" team is under-rated # adjres<-c(results[(results[,1]==whichteam),7],-results[(results[,3]==whichteam),7]) mean(adjres) sd(adjres)