Chapter 7


1. Simulation to compare sample variance and ML variance estimator.

M=10000
v1=v2=me=1:M
for (i in 1:M)
{
y=rnorm(10,100,10)
me[i]=mean(y)
v1[i]=var(y)
v2[i]=var(y)*9/10
}

iq=seq(85,115,.1)
hist(me,prob=T,breaks=100,xlim=c(85,115))
lines(iq,dnorm(iq,100,sqrt(10)))

hist(v1,prob=T,breaks=50,xlim=c(0,300))
hist(v2,prob=T,breaks=50,xlim=c(0,300))
boxplot(v1,v2)
abline(h=100)

bias1=mean(v1)-100
bias2=mean(v2)-100

rmse1=sqrt(mean((v1-100)^2))
rmse2=sqrt(mean((v2-100)^2))

#distributions of variance
(y-E(y))^2 ~ sigma^2 X chisqr(n-1)

v1 ~ sigma^2/(n-1) X chisqr(n-1)
OR
v1*(n-1)/sigma^2 ~ chisqr(n-1)

hist(v1*9/100,prob=T,breaks=50)
val=seq(0,35,.1)
lines(val,dchisq(val,9))

hist(v2*10/100,prob=T,breaks=50)
val=seq(0,35,.1)
lines(val,dchisq(val,9))

2. ML version of a t-test

nll.norm=function(par,y) -sum(dnorm(y,par[1],sqrt(par[2]),log=T))

nll.gen=function(par,y1,y2)
nll.norm(par[c(1,3)],y1)+nll.norm(par[c(2,3)],y2)

M=100
N=100
p=matrix(nrow=M,ncol=2)
for (i in 1:M)
{
y1=rnorm(N)
y2=rnorm(N)
par=c(0,0,1)
gen=optim(par,nll.gen,y1=y1,y2=y2)$value
all=c(y1,y2)
m=mean(all)
le=length(all)
v=var(all)*(le-1)/le
res=nll.norm(c(m,v),all)
#res=optim(c(0,1),nll.norm,y=all)$value
g2=2*(res-gen)
p[i,1]=1-pchisq(g2,1)
p[i,2]=t.test(y1,y2,var.equal=T)$p.value
}

3. Factorial Design Between Subject ANOVA
dat=read.table("factorial.dat",header=T)
levels(dat$Ginko)=c('No.Ginko','Some.Ginko')
levels(dat$Music)=c('Music.On','Music.Off')
means=tapply(dat$IQ,list(dat$Ginko,dat$Music),mean)
barplot(means,beside=T)
tapply(dat$IQ,list(dat$Ginko,dat$Music),mean)
summary(aov(IQ~Ginko*Music,data=dat))

4. Regression
dat=read.table('regression.dat',header=T)
colnames(dat) #returns the column names of dat
plot(dat$freq,dat$rt,cex=.3,pch=20,col='red')
plot(dat$freq,dat$rt,pch=21,bg=rgb(1,0,0,.2),cex=1.5,col=NULL)

#first-order analysis (ignore differences in spread)
g=lm(dat$rt~dat$freq)
summary(g)
abline(g)
#nonparametric smooth
lines(lowess(dat$freq,dat$rt),col='blue')

#try a different function
logfreq=log2(dat$freq)
plot(logfreq,dat$rt,pch=21,bg=rgb(1,0,0,.2),cex=1.5,col=NULL,axes=F)
axis(2)
axis(1,at=log2(c(1:6,8:20)),label=c(1:6,8:20))
g=lm(dat$rt~logfreq)
summary(g)
abline(g)
#nonparametric smooth
lines(lowess(logfreq,dat$rt),col='blue')
box()

#how about a different RT metric?

transformed=log2(dat$rt-.5)
plot(dat$freq,transformed,pch=21,bg=rgb(1,0,0,.2),cex=1.5)
g=lm(transformed~dat$freq)
summary(g)
abline(g)
#nonparametric smooth
lines(lowess(dat$freq,transformed),col='blue')

plot(logfreq,transformed,pch=21,bg=rgb(1,0,0,.2),cex=1.5)
g=lm(transformed~logfreq)
summary(g)
abline(g)
#nonparametric smooth
lines(lowess(logfreq,transformed),col='blue')

5. Multiple Regression
dat$length=as.factor(dat$length)
summary(lm(rt~freq+length+neigh,data=dat))
#or
summary(lm(transformed~logfreq+length+neigh,data=dat))

6. Within Subject ANOVA
dat=read.table("factorial.dat",header=T)
levels(dat$Ginko)=c('No.Ginko','Some.Ginko')
levels(dat$Music)=c('Music.On','Music.Off')
dat$Sub
means=tapply(dat$IQ,list(dat$Ginko,dat$Music),mean)
tapply(dat$IQ,list(dat$Ginko,dat$Music),mean)
summary(aov(IQ~Ginko*Music+Error(Sub/(Ginko*Music)),data=dat))

7. Another Example

Goal. Assess whether there is a lexical-distance effect.

CODES:

1: subj
2: block
3: trial # in block
4: stimulus
0 = 2
1 = 3
2 = 4
3 = 6
4 = 7
5 = 8
5: resp
0 = less than 5
1 = greater than 5
6: RT in ms
7: response accuracy
0 = correct
1 = error

################################################

dat=read.table('anova.example.dat')
is.data.frame(dat)

#look at some data
dat[1,]
dim(dat)

colnames(dat)=c('sub','blk','trl','stm','resp','rt','acc')
da$sub=as.factor(dat$sub)

########################################################
#CLEANING Which data/subjects needed cleaning

mean(dat$rt)
mean(dat$acc)
min(dat$rt)
max(dat$rt)

tapply(dat$acc,dat$sub,mean)
tmp=tapply(dat$acc,dat$sub,mean)
hist(tmp)
sort(tmp)

Lets look at Subjects 34 and 43

dat$acc[dat$sub=='34']
dat$rt[dat$sub=='34']
dat$acc[dat$sub=='43']
dat$rt[dat$sub==43]
plot(dat$rt[dat$sub==43])
plot(dat$rt[dat$sub==34])
plot(dat$rt[dat$sub==3])

Lets take out participants 34 and 43. Leaving them in would
probably not been a big deal given that we will throw away RTs from
errors, RTs that are too samll or too large.

mysubs=!(dat$sub %in% c('34','43'))
clean=dat[mysubs,]

#Parsing Out The Above Statement

dat$sub %in% c('34','43')
!(dat$sub %in% c('34','43'))
mysubs=!(dat$sub %in% c('34','43'))
dat[mysubs,]

#Data without 34 and 43.
clean=dat[mysubs,]

dim(clean) #remember 18720

#OTHER DISCARDS:
#Errors
clean=clean[clean$acc==0,]
# RTs outside of 200ms and 2000ms
before=length(clean$sub)
clean=clean[clean$rt>199 & clean$rt<2001,]
after=length(clean$sub)

#LOOK FOR LEARNING AND?OR FATIGUE

a=tapply(clean$rt,clean$blk,mean)
plot(a)
plot(tapply(clean$rt[clean$blk==0],clean$trl[clean$blk==0],mean))
plot(tapply(clean$rt[clean$blk==1],clean$trl[clean$blk==1],mean))
trial=clean$blk*60+clean$trl
plot(tapply(clean$rt,trial,mean))

#chuck trials one after break ALWAYS DO THIS!!!!
clean=clean[clean$trl>0,]

#chuck first 20 trials in blk 0
clean=clean[clean$blk>0 | clean$trl>20,]

#############################################################3
#Analysis 1: Means.

mrt=tapply(clean$rt,clean$stm,mean)
g=barplot(mrt,ylim=c(550,660),col="slateblue4",
ylab="Mean RT for Correct Responses (ms)",xlab="Digit",xpd=F,
names.arg=c(2,3,4,6,7,8))
xpos=g[,1]
box()

#Analysis 2: ANOVA + Different Error bars

#error bar function
errbar=function(x,y,height,width,lty=1){
arrows(x,y,x,y+height,angle=90,length=width,lty=lty)
arrows(x,y,x,y-height,angle=90,length=width,lty=lty)
}

#Lets pull each subject by condition mean RT for analysis
b=tapply(clean$rt,list(clean$stm,clean$sub),mean)
sdat=as.data.frame.table(b)
colnames(sdat)=c('stm','sub','rt')
is.factor(sdat$sub)
is.factor(sdat$stm)
num.sub=length(levels(sdat$sub))
summary(aov(rt~stm+Error(sub/stm),data=sdat))
MSE=627
se=sqrt(MSE/num.sub)
errbar(xpos,mrt,rep(se,6),.05)
errbar(xpos,mrt,qt(.975,255)*rep(se,6),.1)
#Tukey HSD
hsd=se*qtukey(.95,6,num.sub*6)
errbar(xpos,mrt,rep(hsd,6),.15)

##################################################
#Analysis 3. Some more thoughts

set.of.digits=c(2,3,4,6,7,8)
digit=set.of.digits[clean$stm+1]
dist=abs(digit-5)
#check recoding
plot(clean$stm,dist)

plot(dist+rnorm(length(dist),0,.1),clean$rt,pch=19,col=rgb(1,0,0,.05))
abline(lm(clean$rt~dist))

boxplot(clean$rt~dist)

d1=dist==1.0
d3=dist==3.0
qqplot(clean$rt[d1],clean$rt[d3],pch=19,col='red')
abline(0,1)

probs=seq(.05,.95,.05)
q1=quantile(clean$rt[d1],probs)
q3=quantile(clean$rt[d3],probs)
points(q1,q3,pch=3,cex=1.5)

AttachmentSize
factorial.dat881 bytes
regression.dat6.75 KB
anova.example.dat531.56 KB

Research Methods 3020