SugiharaLab/rEDM

Prediction not NA with disjoint/segmented pred

SoftwareLiteracy opened this issue · 1 comment

As pointed out by @nonlinearnature, the current version (1.8) does not output NA in prediction rows in the presence of disjoint or segmented pred. The code makes predictions in these "gaps" using the available library vectors.

An example of the current behavior:
NOTE: This was run prior to rEDM 1.15. In 1.15 the legacy wrapper block_lnlp was deprecated.

> block <- data.frame( time=1:10, x=sin((1:10)/pi), y=cos((1:10)/pi) )
> out <- block_lnlp(block,lib=rbind(c(1,5),c(6,10)),tp=2,columns=c("x","y"),target_column = "x",stats_only = FALSE)
> out$model_output
   Index Observations Predictions Pred_Variance Const_Predictions
1      1      0.31296         NaN           NaN               NaN
2      2      0.59448         NaN           NaN               NaN
3      3      0.81627     0.95936      0.003832           0.31296
4      4      0.95606     0.89728      0.011778           0.59448
5      5      0.99978     0.88276      0.014781           0.81627
6      6      0.94307     0.89594      0.031104           0.95606
7      7      0.79160     0.59326      0.058161           0.99978
8      8      0.56060     0.26670      0.076164           0.94307
9      9      0.27328     0.28246      0.104256           0.79160
10    10     -0.04149     0.36578      0.024815           0.56060
11    11          NaN     0.09529      0.041300           0.27328
12    12          NaN     0.17593      0.055783          -0.04149

The behavior of rEDM 0.7.4 with disjoint pred:

> out74 <- block_lnlp( block, lib=rbind(c(1,5),c(6,10)), tp=2, columns=c("x","y"), target_column = "x", stats_only = FALSE )
> out74 $ model_output[[1]]
   time      obs   pred pred_var
1     3  0.81627 0.9594 0.003832
2     4  0.95606 0.8973 0.011778
3     5  0.99978 0.8828 0.014781
4     6  0.94307    NaN      NaN
5     7  0.79160    NaN      NaN
6     8  0.56060 0.2667 0.076164
7     9  0.27328 0.2825 0.104256
8    10 -0.04149 0.3658 0.024815
9   NaN      NaN    NaN      NaN
10  NaN      NaN    NaN      NaN

To clarify: the issue is not with disjoint lib, but rather with disjoint pred. Disjoint lib yields results equivalent to rEDM 0.7.4.

In rEDM 0.7.4, if lib is specified and pred is not, then pred is set equal to lib. This is why the example above has NaN in prediction rows 4 and 5. In the 1.x code disjoint pred is not supported. It can be specified, but the output does not remove prediction rows in a prediction gap.

To clarify and provide examples:

library( rEDM )
library( rEDM74 )
df = data.frame( time = 1:10, x = sin((1:10)/pi), y = cos((1:10)/pi) )

# block_lnlp : Embedded = TRUE
B = rEDM74::block_lnlp( df, lib = c(1, 5, 6, 10), pred = c(1, 10),
                        method = "simplex", tp = 2, columns = c("x","y"),
                        target_column = "x", stats_only = FALSE,
                        first_column_time = TRUE )
B $ model_output[[1]]

   time      obs   pred pred_var
1     3  0.81627 0.9594 0.003832
2     4  0.95606 0.8973 0.011778
3     5  0.99978 0.8828 0.014781
4     6  0.94307 0.8959 0.031104
5     7  0.79160 0.5933 0.058161
6     8  0.56060 0.2667 0.076164
7     9  0.27328 0.2825 0.104256
8    10 -0.04149 0.3658 0.024815
9   NaN      NaN    NaN      NaN
10  NaN      NaN    NaN      NaN

rEDM::Simplex( dataFrame = df, lib = "1 5 6 10", pred = "1 10", Tp = 2,
               columns = "x y", target = "x", embedded = TRUE )

   time Observations Predictions Pred_Variance
1     1      0.31296         NaN           NaN
2     2      0.59448         NaN           NaN
3     3      0.81627     0.95936      0.003832
4     4      0.95606     0.89728      0.011778
5     5      0.99978     0.88276      0.014781
6     6      0.94307     0.89594      0.031104
7     7      0.79160     0.59326      0.058161
8     8      0.56060     0.26670      0.076164
9     9      0.27328     0.28246      0.104256
10   10     -0.04149     0.36578      0.024815
11   11          NaN     0.09529      0.041300
12   12          NaN     0.17593      0.055783

# simplex : embedded = FALSE
S = rEDM74::simplex( df[ , c('time','x') ],
                     lib = c(1, 5, 6, 10), pred = c(1, 10),
                     E = 2, tp = 2, stats_only = FALSE )
S $ model_output[[1]]

   time      obs   pred pred_var
1     3  0.81627    NaN      NaN
2     4  0.95606 0.5076  0.22429
3     5  0.99978 0.3787  0.17233
4     6  0.94307 0.4721  0.16575
5     7  0.79160 0.3880  0.12633
6     8  0.56060 0.3348  0.08463
7     9  0.27328 0.4779  0.26568
8    10 -0.04149 0.6781  0.12408
9   NaN      NaN    NaN      NaN
10  NaN      NaN    NaN      NaN

rEDM::Simplex( dataFrame = df, lib = "1 5 6 10", pred = "1 10",
               Tp = 2, E = 2, columns = "x", target = "x", embedded = FALSE )

   time Observations Predictions Pred_Variance
1     2      0.59448         NaN           NaN
2     3      0.81627         NaN           NaN
3     4      0.95606      0.5076       0.22429
4     5      0.99978      0.3787       0.17233
5     6      0.94307      0.4721       0.16575
6     7      0.79160      0.3880       0.12633
7     8      0.56060      0.3348       0.08463
8     9      0.27328      0.4779       0.26568
9    10     -0.04149      0.6781       0.12408
10   11          NaN      0.5719       0.24683
11   12          NaN      0.6466       0.22507

# Disjoint pred : not specifying pred sets pred to lib in rEDM 0.7.4
B2 = rEDM74::block_lnlp( df, lib = c(1,5,6,10), method = "simplex",
                         tp = 2, columns = c("x","y"),target_column = "x",
                         stats_only = FALSE,first_column_time = TRUE )
B2 $ model_output[[1]]

   time      obs   pred pred_var
1     3  0.81627 0.9594 0.003832
2     4  0.95606 0.8973 0.011778
3     5  0.99978 0.8828 0.014781
4     6  0.94307    NaN      NaN
5     7  0.79160    NaN      NaN
6     8  0.56060 0.2667 0.076164
7     9  0.27328 0.2825 0.104256
8    10 -0.04149 0.3658 0.024815
9   NaN      NaN    NaN      NaN
10  NaN      NaN    NaN      NaN

# 1.x code does not segment based on pred
# Since results are not affected, one can simply partition results
S2 = rEDM::Simplex( dataFrame = df, lib = "1 5 6 10", pred = "1 5 6 10",
                    Tp = 2,columns = "x y", target = "x", embedded = TRUE )
S2[ 6:7, 3:4 ] = NaN
S2

   time Observations Predictions Pred_Variance
1     1      0.31296         NaN           NaN
2     2      0.59448         NaN           NaN
3     3      0.81627     0.95936      0.003832
4     4      0.95606     0.89728      0.011778
5     5      0.99978     0.88276      0.014781
6     6      0.94307         NaN           NaN
7     7      0.79160         NaN           NaN
8     8      0.56060     0.26670      0.076164
9     9      0.27328     0.28246      0.104256
10   10     -0.04149     0.36578      0.024815
11   11          NaN     0.09529      0.041300
12   12          NaN     0.17593      0.055783

Since there is no difficulty predicting rows 6 & 7, as the library in this instance supports it, is there a need to remove these predictions from the output?