/global-max-sim-matrix

Compute Global Maximum Similarity Matrix for 2 Sequences

Primary LanguageRGNU General Public License v3.0GPL-3.0

global-max-sim-matrix

R

Compute Global Max Similarity Matrix for 2 Sequences

Conor Heffron 2024-07-16

See overiew & notes at the link below:

Objective: Compute the global max similarity matrix between the two sequences:

  • AAGTGCCTCAAGATA
  • ACCGTCTCAGCAATA

Load scripts & libraries

library(devtools)
## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.3.2
library(stringr)

# use_r("global-max-sim-matrix")
# source('R/global-max-sim-matrix.R')

devtools::install_github("conorheffron/global-max-sim-matrix")
## Skipping install of 'global.max.sim.matrix' from a github remote, the SHA1 (0c1fae77) has not changed since last install.
##   Use `force = TRUE` to force installation
load_all()
## ℹ Loading global.max.sim.matrix

Define Sequence 1

seq1 <- 'AAGTGCCTCAAGATA'
seq1
## [1] "AAGTGCCTCAAGATA"

Define Sequence 2

seq2 <- 'ACCGTCTCAGCAATA'
seq2
## [1] "ACCGTCTCAGCAATA"

Get Sequence 1 & 2 Lengths

m <- nchar(seq1)
n <- nchar(seq2)
print(paste('Sequence Length (seq1, seq2):', m, n))
## [1] "Sequence Length (seq1, seq2): 15 15"

Create Initial Matrix Object & fill with Zeros

# Construct default similarity matrix and fill with 0's
sim_matrix <- matrix(0, m+1, n+1)
sim_matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [2,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [6,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [7,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [8,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [9,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [10,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [11,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [12,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [13,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [14,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [15,]    0    0    0    0    0    0    0    0    0     0     0     0     0
## [16,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16]
##  [1,]     0     0     0
##  [2,]     0     0     0
##  [3,]     0     0     0
##  [4,]     0     0     0
##  [5,]     0     0     0
##  [6,]     0     0     0
##  [7,]     0     0     0
##  [8,]     0     0     0
##  [9,]     0     0     0
## [10,]     0     0     0
## [11,]     0     0     0
## [12,]     0     0     0
## [13,]     0     0     0
## [14,]     0     0     0
## [15,]     0     0     0
## [16,]     0     0     0

Compute First Column Values

sim_matrix <- init_matrix_col1(sim_matrix, m)
sim_matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0    0    0    0    0    0    0    0    0     0     0     0     0
##  [2,]   -2    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]   -4    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]   -6    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]   -8    0    0    0    0    0    0    0    0     0     0     0     0
##  [6,]  -10    0    0    0    0    0    0    0    0     0     0     0     0
##  [7,]  -12    0    0    0    0    0    0    0    0     0     0     0     0
##  [8,]  -14    0    0    0    0    0    0    0    0     0     0     0     0
##  [9,]  -16    0    0    0    0    0    0    0    0     0     0     0     0
## [10,]  -18    0    0    0    0    0    0    0    0     0     0     0     0
## [11,]  -20    0    0    0    0    0    0    0    0     0     0     0     0
## [12,]  -22    0    0    0    0    0    0    0    0     0     0     0     0
## [13,]  -24    0    0    0    0    0    0    0    0     0     0     0     0
## [14,]  -26    0    0    0    0    0    0    0    0     0     0     0     0
## [15,]  -28    0    0    0    0    0    0    0    0     0     0     0     0
## [16,]  -30    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16]
##  [1,]     0     0     0
##  [2,]     0     0     0
##  [3,]     0     0     0
##  [4,]     0     0     0
##  [5,]     0     0     0
##  [6,]     0     0     0
##  [7,]     0     0     0
##  [8,]     0     0     0
##  [9,]     0     0     0
## [10,]     0     0     0
## [11,]     0     0     0
## [12,]     0     0     0
## [13,]     0     0     0
## [14,]     0     0     0
## [15,]     0     0     0
## [16,]     0     0     0

Compute First Row Values

sim_matrix <- init_matrix_row1(sim_matrix, n)
sim_matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0   -2   -4   -6   -8  -10  -12  -14  -16   -18   -20   -22   -24
##  [2,]   -2    0    0    0    0    0    0    0    0     0     0     0     0
##  [3,]   -4    0    0    0    0    0    0    0    0     0     0     0     0
##  [4,]   -6    0    0    0    0    0    0    0    0     0     0     0     0
##  [5,]   -8    0    0    0    0    0    0    0    0     0     0     0     0
##  [6,]  -10    0    0    0    0    0    0    0    0     0     0     0     0
##  [7,]  -12    0    0    0    0    0    0    0    0     0     0     0     0
##  [8,]  -14    0    0    0    0    0    0    0    0     0     0     0     0
##  [9,]  -16    0    0    0    0    0    0    0    0     0     0     0     0
## [10,]  -18    0    0    0    0    0    0    0    0     0     0     0     0
## [11,]  -20    0    0    0    0    0    0    0    0     0     0     0     0
## [12,]  -22    0    0    0    0    0    0    0    0     0     0     0     0
## [13,]  -24    0    0    0    0    0    0    0    0     0     0     0     0
## [14,]  -26    0    0    0    0    0    0    0    0     0     0     0     0
## [15,]  -28    0    0    0    0    0    0    0    0     0     0     0     0
## [16,]  -30    0    0    0    0    0    0    0    0     0     0     0     0
##       [,14] [,15] [,16]
##  [1,]   -26   -28   -30
##  [2,]     0     0     0
##  [3,]     0     0     0
##  [4,]     0     0     0
##  [5,]     0     0     0
##  [6,]     0     0     0
##  [7,]     0     0     0
##  [8,]     0     0     0
##  [9,]     0     0     0
## [10,]     0     0     0
## [11,]     0     0     0
## [12,]     0     0     0
## [13,]     0     0     0
## [14,]     0     0     0
## [15,]     0     0     0
## [16,]     0     0     0
# define score look-up map
d <- c("match" = 1, "mismatch" = -1, "gap" = -2)
d
##    match mismatch      gap 
##        1       -1       -2

Compute Global Max Similarity Matrix for seq1 & seq2

sim_matrix <- populate_global_max_sim_matrix(sim_matrix, seq1, seq2, d, m, n)
sim_matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0   -2   -4   -6   -8  -10  -12  -14  -16   -18   -20   -22   -24
##  [2,]   -2    1   -1   -3   -5   -7   -9  -11  -13   -15   -17   -19   -21
##  [3,]   -4   -1    0   -2   -4   -6   -8  -10  -12   -12   -14   -16   -18
##  [4,]   -6   -3   -2   -1   -1   -3   -5   -7   -9   -11   -11   -13   -15
##  [5,]   -8   -5   -4   -3   -2    0   -2   -4   -6    -8   -10   -12   -14
##  [6,]  -10   -7   -6   -5   -2   -2   -1   -3   -5    -7    -7    -9   -11
##  [7,]  -12   -9   -6   -5   -4   -3   -1   -2   -2    -4    -6    -6    -8
##  [8,]  -14  -11   -8   -5   -6   -5   -2   -2   -1    -3    -5    -5    -7
##  [9,]  -16  -13  -10   -7   -6   -5   -4   -1   -3    -2    -4    -6    -6
## [10,]  -18  -15  -12   -9   -8   -7   -4   -3    0    -2    -3    -3    -5
## [11,]  -20  -17  -14  -11  -10   -9   -6   -5   -2     1    -1    -3    -2
## [12,]  -22  -19  -16  -13  -12  -11   -8   -7   -4    -1     0    -2    -2
## [13,]  -24  -21  -18  -15  -12  -13  -10   -9   -6    -3     0    -1    -3
## [14,]  -26  -23  -20  -17  -14  -13  -12  -11   -8    -5    -2    -1     0
## [15,]  -28  -25  -22  -19  -16  -13  -14  -11  -10    -7    -4    -3    -2
## [16,]  -30  -27  -24  -21  -18  -15  -14  -13  -12    -9    -6    -5    -2
##       [,14] [,15] [,16]
##  [1,]   -26   -28   -30
##  [2,]   -23   -25   -27
##  [3,]   -20   -22   -24
##  [4,]   -17   -19   -21
##  [5,]   -16   -16   -18
##  [6,]   -13   -15   -17
##  [7,]   -10   -12   -14
##  [8,]    -9   -11   -13
##  [9,]    -8    -8   -10
## [10,]    -7    -9    -9
## [11,]    -4    -6    -8
## [12,]    -1    -3    -5
## [13,]    -3    -2    -4
## [14,]    -2    -4    -1
## [15,]    -1    -1    -3
## [16,]    -1    -2     0

Label columns and rows

# colnames(sim_matrix) <- c('-', 'A', 'A', 'G', 'T', 'G', 'C', 'C', 'T', 'C', 'A', 'A', 'G', 'A', 'T', 'A')
colnames(sim_matrix) <- unlist(stringr::str_split(paste('-', seq1, sep=''), boundary("character")))

# rownames(sim_matrix) <- c('-', 'A', 'C', 'C', 'G', 'T', 'C', 'T', 'C', 'A', 'G', 'C', 'A', 'A', 'T', 'A')
rownames(sim_matrix) <- unlist(stringr::str_split(paste('-', seq2, sep=''), boundary("character")))

Final Matrix Output

sim_matrix
##     -   A   A   G   T   G   C   C   T   C   A   A   G   A   T   A
## -   0  -2  -4  -6  -8 -10 -12 -14 -16 -18 -20 -22 -24 -26 -28 -30
## A  -2   1  -1  -3  -5  -7  -9 -11 -13 -15 -17 -19 -21 -23 -25 -27
## C  -4  -1   0  -2  -4  -6  -8 -10 -12 -12 -14 -16 -18 -20 -22 -24
## C  -6  -3  -2  -1  -1  -3  -5  -7  -9 -11 -11 -13 -15 -17 -19 -21
## G  -8  -5  -4  -3  -2   0  -2  -4  -6  -8 -10 -12 -14 -16 -16 -18
## T -10  -7  -6  -5  -2  -2  -1  -3  -5  -7  -7  -9 -11 -13 -15 -17
## C -12  -9  -6  -5  -4  -3  -1  -2  -2  -4  -6  -6  -8 -10 -12 -14
## T -14 -11  -8  -5  -6  -5  -2  -2  -1  -3  -5  -5  -7  -9 -11 -13
## C -16 -13 -10  -7  -6  -5  -4  -1  -3  -2  -4  -6  -6  -8  -8 -10
## A -18 -15 -12  -9  -8  -7  -4  -3   0  -2  -3  -3  -5  -7  -9  -9
## G -20 -17 -14 -11 -10  -9  -6  -5  -2   1  -1  -3  -2  -4  -6  -8
## C -22 -19 -16 -13 -12 -11  -8  -7  -4  -1   0  -2  -2  -1  -3  -5
## A -24 -21 -18 -15 -12 -13 -10  -9  -6  -3   0  -1  -3  -3  -2  -4
## A -26 -23 -20 -17 -14 -13 -12 -11  -8  -5  -2  -1   0  -2  -4  -1
## T -28 -25 -22 -19 -16 -13 -14 -11 -10  -7  -4  -3  -2  -1  -1  -3
## A -30 -27 -24 -21 -18 -15 -14 -13 -12  -9  -6  -5  -2  -1  -2   0

Run tests

devtools::test()
## ℹ Testing global.max.sim.matrix

## ✔ | F W  S  OK | Context
## ⠏ |          0 | global-max-sim-matrix                                                [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0   -2   -4   -6   -8  -10  -12  -14  -16   -18   -20   -22   -24
##  [2,]   -2    1   -1   -3   -5   -7   -9  -11  -13   -15   -17   -19   -21
##  [3,]   -4   -1    0   -2   -4   -6   -8  -10  -12   -12   -14   -16   -18
##  [4,]   -6   -3   -2   -1   -1   -3   -5   -7   -9   -11   -11   -13   -15
##  [5,]   -8   -5   -4   -3   -2    0   -2   -4   -6    -8   -10   -12   -14
##  [6,]  -10   -7   -6   -5   -2   -2   -1   -3   -5    -7    -7    -9   -11
##  [7,]  -12   -9   -6   -5   -4   -3   -1   -2   -2    -4    -6    -6    -8
##  [8,]  -14  -11   -8   -5   -6   -5   -2   -2   -1    -3    -5    -5    -7
##  [9,]  -16  -13  -10   -7   -6   -5   -4   -1   -3    -2    -4    -6    -6
## [10,]  -18  -15  -12   -9   -8   -7   -4   -3    0    -2    -3    -3    -5
## [11,]  -20  -17  -14  -11  -10   -9   -6   -5   -2     1    -1    -3    -2
## [12,]  -22  -19  -16  -13  -12  -11   -8   -7   -4    -1     0    -2    -2
## [13,]  -24  -21  -18  -15  -12  -13  -10   -9   -6    -3     0    -1    -3
## [14,]  -26  -23  -20  -17  -14  -13  -12  -11   -8    -5    -2    -1     0
## [15,]  -28  -25  -22  -19  -16  -13  -14  -11  -10    -7    -4    -3    -2
## [16,]  -30  -27  -24  -21  -18  -15  -14  -13  -12    -9    -6    -5    -2
##       [,14] [,15] [,16]
##  [1,]   -26   -28   -30
##  [2,]   -23   -25   -27
##  [3,]   -20   -22   -24
##  [4,]   -17   -19   -21
##  [5,]   -16   -16   -18
##  [6,]   -13   -15   -17
##  [7,]   -10   -12   -14
##  [8,]    -9   -11   -13
##  [9,]    -8    -8   -10
## [10,]    -7    -9    -9
## [11,]    -4    -6    -8
## [12,]    -1    -3    -5
## [13,]    -3    -2    -4
## [14,]    -2    -4    -1
## [15,]    -1    -1    -3
## [16,]    -1    -2     0
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
##  [1,]    0   -2   -4   -6   -8  -10  -12  -14  -16   -18   -20   -22   -24
##  [2,]   -2    1   -1   -3   -5   -7   -9  -11  -13   -15   -17   -19   -21
##  [3,]   -4   -1    0   -2   -4   -6   -8  -10  -12   -12   -14   -16   -18
##  [4,]   -6   -3   -2   -1   -1   -3   -5   -7   -9   -11   -11   -13   -15
##  [5,]   -8   -5   -4   -3   -2    0   -2   -4   -6    -8   -10   -12   -14
##  [6,]  -10   -7   -6   -5   -2   -2   -1   -3   -5    -7    -7    -9   -11
##  [7,]  -12   -9   -6   -5   -4   -3   -1   -2   -2    -4    -6    -6    -8
##  [8,]  -14  -11   -8   -5   -6   -5   -2   -2   -1    -3    -5    -5    -7
##  [9,]  -16  -13  -10   -7   -6   -5   -4   -1   -3    -2    -4    -6    -6
## [10,]  -18  -15  -12   -9   -8   -7   -4   -3    0    -2    -3    -3    -5
## [11,]  -20  -17  -14  -11  -10   -9   -6   -5   -2     1    -1    -3    -2
## [12,]  -22  -19  -16  -13  -12  -11   -8   -7   -4    -1     0    -2    -2
## [13,]  -24  -21  -18  -15  -12  -13  -10   -9   -6    -3     0    -1    -3
## [14,]  -26  -23  -20  -17  -14  -13  -12  -11   -8    -5    -2    -1     0
## [15,]  -28  -25  -22  -19  -16  -13  -14  -11  -10    -7    -4    -3    -2
## [16,]  -30  -27  -24  -21  -18  -15  -14  -13  -12    -9    -6    -5    -2
##       [,14] [,15] [,16]
##  [1,]   -26   -28   -30
##  [2,]   -23   -25   -27
##  [3,]   -20   -22   -24
##  [4,]   -17   -19   -21
##  [5,]   -16   -16   -18
##  [6,]   -13   -15   -17
##  [7,]   -10   -12   -14
##  [8,]    -9   -11   -13
##  [9,]    -8    -8   -10
## [10,]    -7    -9    -9
## [11,]    -4    -6    -8
## [12,]    -1    -3    -5
## [13,]    -3    -2    -4
## [14,]    -2    -4    -1
## [15,]    -1    -1    -3
## [16,]    -1    -2     0
## ✔ |          4 | global-max-sim-matrix
## 
## ══ Results ═════════════════════════════════════════════════════════════════════
## [ FAIL 0 | WARN 0 | SKIP 0 | PASS 4 ]