This project is based on the paper "Pixel Embedding for Grayscale Medical Image Classification", which extends existing text-embedding architectures to grayscale medical image classification. Each pixel of an image is converted into a token, and the pixels are flattened in a specified order; this flattened representation is then used as input to train a text embedding model for classification. Spatial relationships within the grayscale image are captured by the n-gram mechanism commonly used in NLP models. The framework is a lightweight solution built on the text-embedding tool StarSpace: it needs no dedicated CUDA device and does not depend on popular computer-vision or machine-learning libraries such as Torch or OpenCV.
Liu Wensu (刘文粟)
Wang Lu (王璐)
Zhou Xiaobei (周晓北)
- R (>=3.5.3)
For the time being, load the functions with:
source("~/pixel_embedding/R/code.R")
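To illustrate the pixel-to-token idea described above, the toy sketch below (not the package's own trans() function; the "p<value>" token naming and the 0-255 binning are assumptions made only for this illustration) turns a tiny grayscale matrix into a "sentence" of pixel tokens and attaches a class label in fastText format, which is the shape of input the scripts below feed to StarSpace.
## toy sketch: tokenise a 3 x 3 grayscale image and flatten it column-wise
img <- matrix(runif(9),nrow=3,ncol=3)              # fake grayscale image, intensities in [0,1]
tokens <- paste0("p",round(img*255))               # one token per pixel; paste0() flattens column-wise
sentence <- paste(tokens,collapse=" ")             # the "sentence" fed to the text-embedding model
paste("__label__1",sentence)                       # with its class label, in fastText format
In the scripts below this conversion is performed by trans() from code.R, and StarSpace n-grams over neighbouring tokens provide the local spatial context.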
Classification for Medical MNIST dataset
##download dataset##
## https://www.kaggle.com/datasets/andrewmvd/medical-mnist
library(stringr)
library(dplyr)
library(ruimtehol)
library(OpenImageR)
dir <- "/home/share/data/embedding/mnist_med"
f <- list.files(dir,recursive=T,full.names=T)
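## read every image and downscale it to 28 x 28 with bilinear interpolation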
x <- lapply(f,readImage)
x <- lapply(x,resizeImage,width=28,height=28,method="bilinear",normalize_pixels=TRUE)
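## class labels are the sub-directory names under dir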
y <-gsub("/home/share/data/embedding/mnist_med/","",f)
y <-gsub("/.*.jpeg","",y)
y <- as.factor(y)
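## random 80/20 train/test split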
set.seed(200)
ind <- sample(2, length(x), replace = TRUE, prob = c(0.8,0.2))
train_y <- y[ind==1]
test_y <- y[ind==2]
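## trans() (defined in code.R) turns each 28 x 28 matrix into a string of pixel tokens;
## the flatten codes select the flattening orders, and filter presumably restricts the
## pixels kept to indices 3..26 in each direction (an assumption; see code.R for details)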
flatten=c("c","r","d","v")
filter=c(3,26,3,26)
X <- trans(x,flatten=flatten,filter=filter)
## no filter ##
##X <- trans(x,flatten=flatten) ##
train_x <- X[ind==1]
test_x <- X[ind==2]
filename <- "textspace_tmp.txt"
filename_validation <- "textspace_validation_tmp.txt"
if (file.exists(filename)) file.remove(filename)
if (file.exists(filename_validation)) file.remove(filename_validation)
x1 <- train_x
y1 <- train_y
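## fastText format: prepend "__label__<class>" to each flattened-pixel string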
label <- "__label__"
targets <- ifelse(is.na(y1),NA_character_,paste(label,y1,sep=""))
x1 <- ifelse(is.na(targets), x1, paste(targets,x1,sep = " "))
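## hold out ~10% of each class (stratified) as a validation set for early stopping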
early_stopping <- 0.1
s <- unique(y1)
n <- sapply(s,function(z) sum(y1==z))
n <- round(early_stopping*min(n))
id <- lapply(s,function(z) sample(which(y1==z),n))
id <- unlist(id)
writeLines(text=x1[-id],con=filename,useBytes=FALSE)
writeLines(text=x1[id],con=filename_validation,useBytes=FALSE)
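## train the StarSpace classifier (trainMode=0, i.e. TagSpace-style labelled classification)
## on the fastText-formatted training file, validating on the held-out lines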
model <- starspace(model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=5,lr=0.001,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
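## if the per-class validation count is small, continue training from the learned
## embeddings with a lower learning rate and longer patience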
if(n<200)
{
m <- as.matrix(model,type="all")
model <- starspace(embeddings=m,model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=10,lr=0.0005,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
}
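## top-1 prediction for each test image; accuracy from the confusion table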
p <- predict(model,test_x,k=1)
p <- sapply(p,function(x) x$prediction$label)
tab <- table(Predicted=p,Actual=test_y)
acc <- sum(diag(tab))/sum(tab)
Classification for Chest X-ray Pneumonia dataset
library(stringr)
library(dplyr)
library(ruimtehol)
library(OpenImageR)
dir <- "/home/share/data/embedding/chest_x_ray/chest_xray"
f <- list.files(dir,recursive=T,full.names=T)
x <- lapply(f,readImage)
## keep only the first channel when an image was read as a 3-d array
x <- lapply(x,function(z) if(is.matrix(z)) z else z[,,1])
x <- lapply(x,resizeImage,width=28,height=28,method="bilinear",normalize_pixels=TRUE)
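## rescale all images jointly to [0,1] using the global minimum and maximum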
min1 <- min(unlist(x),na.rm=TRUE)
max1 <- max(unlist(x),na.rm=TRUE)
x <- lapply(x,function(z) (z-min1)/(max1-min1))
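## label 1 = pneumonia, 0 = NORMAL; the dataset's predefined test folder is the test split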
y <- rep("1",length(f))
y[grepl("NORMAL",f)] <- "0"
ind <- rep(1,length(f))
ind[grepl("test",f)] <- 2
train_y <- y[ind==1]
test_y <- y[ind==2]
train_x <- x[ind==1]
test_x <- x[ind==2]
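## keep only 2500 pneumonia training images to reduce the class imbalance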
keep <- c(which(train_y=="1")[1:2500],which(train_y=="0"))
train_y <- train_y[keep]
train_x <- train_x[keep]
x <- append(train_x,test_x)
ind <- c(rep(1,length(train_x)),rep(2,length(test_x)))
flatten=c("c","r","d","v")
filter=c(5,22,3,25)
X <- trans(x,flatten=flatten,filter=filter)
## no filter ##
##X <- trans(x,flatten=flatten) ##
train_x <- X[ind==1]
test_x <- X[ind==2]
filename <- "textspace_tmp.txt"
filename_validation <- "textspace_validation_tmp.txt"
if (file.exists(filename)) file.remove(filename)
if (file.exists(filename_validation)) file.remove(filename_validation)
x1 <- train_x
y1 <- train_y
label <- "__label__"
targets <- ifelse(is.na(y1),NA_character_,paste(label,y1,sep=""))
x1 <- ifelse(is.na(targets), x1, paste(targets,x1,sep = " "))
early_stopping <- 0.1
s <- unique(y1)
n <- sapply(s,function(z) sum(y1==z))
n <- round(early_stopping*min(n))
id <- lapply(s,function(z) sample(which(y1==z),n))
id <- unlist(id)
writeLines(text=x1[-id],con=filename,useBytes=FALSE)
writeLines(text=x1[id],con=filename_validation,useBytes=FALSE)
model <- starspace(model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=5,lr=0.001,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
if(n<200)
{
m <- as.matrix(model,type="all")
model <- starspace(embeddings=m,model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=10,lr=0.0005,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
}
p <- predict(model,test_x,k=1)
p <- sapply(p,function(x) x$prediction$label)
tab <- table(Predicted=p,Actual=test_y)
acc <- sum(diag(tab))/sum(tab)
Classification for Curated Covid CT dataset
##download dataset##
##https://www.kaggle.com/datasets/mehradaria/covid19-lung-ct-scans
library(stringr)
library(dplyr)
library(ruimtehol)
library(OpenImageR)
dir <- "/home/share/data/embedding/covid_lung_ct/curated_data/curated_data/"
f <- list.files(dir,recursive=T,full.names=T)
## read PNG first and fall back to JPEG when the file is not a PNG
x <- lapply(f,function(fn) {y <- try(png::readPNG(fn),silent=TRUE)
  if(inherits(y,"try-error")) y <- jpeg::readJPEG(fn)
  y})
## keep only the first channel when an image was read as a 3-d array
x <- lapply(x,function(z) if(is.matrix(z)) z else z[,,1])
x <- lapply(x,resizeImage,width=28,height=28,method="bilinear",normalize_pixels=TRUE)
min1 <- min(unlist(x),na.rm=TRUE)
max1 <- max(unlist(x),na.rm=TRUE)
x <- lapply(x,function(z) (z-min1)/(max1-min1))
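## three classes: 1 = COVID-19, 0 = NonCOVID, 2 = CAP (community-acquired pneumonia)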
y <- rep("1",length(f))
y[grepl("NonCOVID",f)] <- "0"
y[grepl("CAP",f)] <- "2"
set.seed(200)
ind <- sample(2, length(x),replace=TRUE,prob=c(0.8,0.2))
train_y <- y[ind==1]
test_y <- y[ind==2]
flatten=c("c","r","d","v")
filter=c(3,26,3,26)
## no filter ##
##X <- trans(x,flatten=flatten) ##
X <- trans(x,flatten=flatten,filter=filter)
train_x <- X[ind==1]
test_x <- X[ind==2]
filename <- "textspace_tmp.txt"
filename_validation <- "textspace_validation_tmp.txt"
if (file.exists(filename)) file.remove(filename)
if (file.exists(filename_validation)) file.remove(filename_validation)
x1 <- train_x
y1 <- train_y
label <- "__label__"
targets <- ifelse(is.na(y1),NA_character_,paste(label,y1,sep=""))
x1 <- ifelse(is.na(targets), x1, paste(targets,x1,sep = " "))
early_stopping <- 0.1
s <- unique(y1)
n <- sapply(s,function(z) sum(y1==z))
n <- round(early_stopping*min(n))
id <- lapply(s,function(z) sample(which(y1==z),n))
id <- unlist(id)
writeLines(text=x1[-id],con=filename,useBytes=FALSE)
writeLines(text=x1[id],con=filename_validation,useBytes=FALSE)
model <- starspace(model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=5,lr=0.001,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
if(n<200)
{
m <- as.matrix(model,type="all")
model <- starspace(embeddings=m,model="tagspace.bin",file=filename,trainMode=0,fileFormat="fastText",ngrams=10,
validationFile=filename_validation,dim=20,validationPatience=10,lr=0.0005,epoch=30,minCount=1,
adagrad=F,useWeight=T,similarity="dot",loss="hinge",thread=40,negSearchLimit=1000,bucket=500000,maxNegSamples=100)
}
p <- predict(model,test_x,k=1)
p <- sapply(p,function(x) x$prediction$label)
tab <- table(Predicted=p,Actual=test_y)
acc <- sum(diag(tab))/sum(tab)