With a small sample of discographies pulled from the Spotify API, I build two popularity prediction models based on musical key. First, a normal-normal mixture model, with both unique and fixed variance iterations. I use the EM algorithm to find maximum likelihood estimates for each unknown parameter. Second, in a hierarchical bayesian framework, where songs are nested within keys, I build credible intervals for a single band, specifically using the Gibbs sampler to draw samples from the posterior distribution.
A normal-normal mixture model with EM algorithm, then hierarchical bayes. See Markdown Link
#Expectation function
Estep <- function(y,alpha,mu0,mu1,sigsq0,sigsq1){
n <- length(y)
ind <- rep(NA,n)
for (i in 1:n){
prob0 <- (1-alpha)*dnorm(y[i],mean=mu0,sd=sqrt(sigsq0))
prob1 <- alpha*dnorm(y[i],mean=mu1,sd=sqrt(sigsq1))
ind[i] <- prob1/(prob0+prob1)
}
ind
}
#Maximization function
Mstep <- function(y,ind){
n <- length(y)
alpha <- sum(ind)/n
mu1 <- sum(ind*y)/sum(ind)
mu0 <- sum((1-ind)*y)/sum(1-ind)
sigsq1 <- sum(ind*((y-mu1)^2))/sum(ind)
sigsq0 <- sum((1-ind)*((y-mu0)^2))/sum(1-ind)
c(alpha,mu0,mu1,sigsq0,sigsq1)
}
##observed data loglikelihood function
loglik.mix <- function(y,ind,alpha,mu0,mu1,sigsq0,sigsq1){
loglik <- sum(log(alpha*dnorm(y,mu1,sqrt(sigsq1))+(1-alpha)*dnorm(y,mu0,sqrt(sigsq0))))
loglik
}
#Running EM iterations
curalpha <- 0.5
curmu0 <- 0.1
curmu1 <- 0.4
cursigsq0 <- 0.03
cursigsq1 <- 0.03
mean(mean_pop)
curind <- Estep(mean_pop,curalpha,curmu0,curmu1,cursigsq0,cursigsq1)
loglik <- loglik.mix(mean_pop,curind,curalpha,curmu0,curmu1,cursigsq0,cursigsq1)
itermat <- c(curalpha,curmu0,curmu1,cursigsq0,cursigsq1,loglik)
diff <- 1
numiters <- 1
while (diff > 0.001 || numiters <= 100){
curind <- Estep(mean_pop,curalpha,curmu0,curmu1,cursigsq0,cursigsq1)
curparam <- Mstep(mean_pop,curind)
curalpha <- curparam[1]
curmu0 <- curparam[2]
curmu1 <- curparam[3]
cursigsq0 <- curparam[4]
cursigsq1 <- curparam[5]
itermat <- rbind(itermat,c(curparam,loglik))
loglik <- loglik.mix(mean_pop,curind,curalpha,curmu0,curmu1,cursigsq0,cursigsq1)
numiters <- numiters + 1
diff <- max(abs(itermat[numiters,]-itermat[numiters-1,]))
# print (c(numiters,loglik))
}