| Type: | Package |
| Title: | Transfer Learning for Generalized Factor Models |
| Version: | 1.0.1 |
| Date: | 2025-11-10 |
| Maintainer: | Zhijing Wang <wangzhijing@sjtu.edu.cn> |
| Description: | Transfer learning for generalized factor models with support for continuous, count (Poisson), and binary data types. The package provides functions for single and multiple source transfer learning, source detection to identify positive and negative transfer sources, factor decomposition using Maximum Likelihood Estimation (MLE), and information criteria ('IC1' and 'IC2') for rank selection. The methods are particularly useful for high-dimensional data analysis where auxiliary information from related source datasets can improve estimation efficiency in the target domain. |
| License: | GPL-3 |
| Encoding: | UTF-8 |
| Depends: | R (≥ 3.5.0) |
| Imports: | stats |
| Suggests: | testthat (≥ 3.0.0), knitr, rmarkdown |
| RoxygenNote: | 7.3.2 |
| NeedsCompilation: | no |
| Packaged: | 2025-11-10 01:57:23 UTC; clswt-wangzhijing |
| Author: | Zhijing Wang [aut, cre] |
| Repository: | CRAN |
| Date/Publication: | 2025-11-13 18:40:02 UTC |
Information criterion (IC1/IC2) for selecting number of factors
Description
Information criterion (IC1/IC2) for selecting number of factors
Usage
ic_criterion(
X,
r_max = 10,
ic_type = c("IC1", "IC2"),
data_type = "count",
C = NULL,
max_iter = 30,
verbose = FALSE
)
Arguments
X |
Data matrix (may contain missing values coded as NA) |
r_max |
Maximum number of factors to consider (default: 10) |
ic_type |
IC criterion type: "IC1" or "IC2" (default: "IC1") |
data_type |
Type of data: "continuous", "count", or "binary" |
C |
CJMLE projection constant (if NULL, auto-calculated) |
max_iter |
Maximum CJMLE iterations (default: 30) |
verbose |
Print progress information (default: FALSE) |
Value
List with r_hat (optimal rank), ic_values, loglik_values
Examples
# Generate Poisson data with known rank
set.seed(2025)
n <- 100; p <- 100; r_true <- 2
# Generate true factors
F_true <- matrix(runif(n * r_true, min = -2, max = 2), n, r_true)
B_true <- matrix(runif(p * r_true, min = -2, max = 2), p, r_true)
M_true <- F_true %*% t(B_true)
# Generate Poisson observations
lambda <- exp(M_true)
X <- matrix(rpois(n * p, as.vector(lambda)), n, p)
# Add 10% missing values
n_missing <- floor(n * p * 0.1)
missing_idx <- sample(n * p, n_missing)
X[missing_idx] <- NA
# Use IC1 to select rank
result_IC1 <- ic_criterion(
X = X,
r_max = 6,
ic_type = "IC1",
data_type = "count",
verbose = TRUE
)
print(paste("True rank:", r_true))
print(paste("Estimated rank (IC1):", result_IC1$r_hat))
# Use IC2 to select rank
result_IC2 <- ic_criterion(
X = X,
r_max = 6,
ic_type = "IC2",
data_type = "count",
verbose = TRUE
)
Identify factor decomposition via SVD
Description
Identify factor decomposition via SVD
Usage
identify(M, r)
Arguments
M |
Matrix to decompose |
r |
Number of factors |
Value
List with F (row factors) and B (column factors)
Examples
# Generate Poisson data
set.seed(123)
n0 <- 50; p0 <- 50; r <- 2
F_true <- matrix(runif(n0 * r, min = -2, max = 2), n0, r)
B_true <- matrix(runif(p0 * r, min = -2, max = 2), p0, r)
F_true <- F_true / sqrt(r)
B_true <- B_true / sqrt(r)
M_true <- F_true %*% t(B_true)
# Decompose using identify
result <- identify(M_true, r = 2)
F_hat <- result$F
B_hat <- result$B
# Check reconstruction
M_reconstructed <- F_hat %*% t(B_hat)
print(max(abs(M_reconstructed - M_true))) # Should be very small
Calculate relative error between estimated and true matrices
Description
Calculate relative error between estimated and true matrices
Usage
relative_error(M_hat, M_true)
Arguments
M_hat |
Estimated matrix |
M_true |
True matrix |
Value
Relative Frobenius norm error
Examples
M_true <- matrix(1:9, 3, 3)
M_hat <- M_true + matrix(rnorm(9, 0, 0.1), 3, 3)
relative_error(M_hat, M_true)
Detect positive and negative transfer sources using ratio method
Description
Detect positive and negative transfer sources using ratio method
Usage
source_detection(
X_sources,
X0,
r,
C,
C2,
data_type = "count",
c_penalty = 0.1,
verbose = TRUE
)
Arguments
X_sources |
List of source data matrices (may contain missing values) |
X0 |
Target data matrix (complete) |
r |
Number of factors |
C |
CJMLE projection constant |
C2 |
Refinement projection constant |
data_type |
Type of data: "continuous", "count", or "binary" |
c_penalty |
Penalty coefficient (default: 0.1) |
verbose |
Print progress information (default: TRUE) |
Value
List with positive_sources, negative_sources, and diagnostic info
Identify potential sources based on rank comparison using IC criterion
Description
Identify potential sources based on rank comparison using IC criterion
Usage
source_potential(
X_sources,
X0,
r_max = 10,
ic_type = "IC1",
data_type = "count",
C = NULL,
max_iter = 30,
verbose = TRUE
)
Arguments
X_sources |
List of source data matrices (may contain missing values) |
X0 |
Target data matrix (may contain missing values) |
r_max |
Maximum number of factors to consider (default: 10) |
ic_type |
IC criterion type: "IC1" or "IC2" (default: "IC1") |
data_type |
Type of data: "continuous", "count", or "binary" |
C |
CJMLE projection constant (if NULL, auto-calculated) |
max_iter |
Maximum CJMLE iterations (default: 30) |
verbose |
Print progress information (default: TRUE) |
Value
List with positive_potential_sources, negative_sources, r_target, r_sources
Examples
# Generate Poisson data
set.seed(2025)
# Generate 5 sources with different ranks
n1 <- 100; p1 <- 100
source_list <- list()
# Sources 1-2: rank 2 (same as target)
r_s <- 2
F_s <- matrix(runif(n1 * r_s, min = -2, max = 2), n1, r_s)
B_s <- matrix(runif(p1 * r_s, min = -2, max = 2), p1, r_s)
M_s <- F_s %*% t(B_s)
for (s in 1:2) {
X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1)
# Add 10% missing values
n_missing <- floor(n1 * p1 * 0.1)
missing_idx <- sample(n1 * p1, n_missing)
X_s[missing_idx] <- NA
source_list[[s]] <- X_s
}
# Sources 3-5: rank 3 (different from target)
for (s in 3:5) {
r_s_nega <- 3
F_s_nega <- matrix(runif(n1 * r_s_nega, min = -2, max = 2), n1, r_s_nega)
B_s_nega <- matrix(runif(p1 * r_s_nega, min = -2, max = 2), p1, r_s_nega)
M_s_nega <- F_s_nega %*% t(B_s_nega)
X_s_nega <- matrix(rpois(n1 * p1, exp(M_s_nega)), n1, p1)
n_missing <- floor(n1 * p1 * 0.1)
missing_idx <- sample(n1 * p1, n_missing)
X_s_nega[missing_idx] <- NA
source_list[[s]] <- X_s_nega
}
# Target data: rank 2
n0 <- 50; p0 <- 50; r_target <- 2
M_target <- M_s[1:n0, 1:p0]
X_target <- matrix(rpois(n0 * p0, exp(M_target)), n0, p0)
# Identify potential sources
result <- source_potential(
X_sources = source_list,
X0 = X_target,
r_max = 5,
ic_type = "IC1",
data_type = "count",
verbose = TRUE
)
print(result$positive_potential_sources) # Should be c(1, 2)
print(result$negative_sources) # Should be c(3, 4, 5)
print(result$r_target) # Should be 2
print(result$r_sources) # Should be c(2, 2, 3, 3, 3)
Single source transfer learning for generalized factor models
Description
Single source transfer learning for generalized factor models
Usage
transGFM(
source_data,
target_data,
r,
data_type = "count",
lambda_seq = seq(0, 10, by = 1),
K_cv = 3,
sigma2 = 1,
max_iter_cjmle = 30,
max_iter_refine = 30,
max_iter_nuclear = 30,
verbose = FALSE
)
Arguments
source_data |
Source data matrix (may contain missing values coded as NA) |
target_data |
Target data matrix (complete) |
r |
Number of factors |
data_type |
Type of data: "continuous", "count", or "binary" |
lambda_seq |
Sequence of lambda values for CV (default: seq(0, 10, by = 1)) |
K_cv |
Number of CV folds (default: 3) |
sigma2 |
Variance parameter for continuous data (default: 1) |
max_iter_cjmle |
Maximum iterations for CJMLE (default: 30) |
max_iter_refine |
Maximum iterations for refinement (default: 30) |
max_iter_nuclear |
Maximum iterations for nuclear MLE (default: 100) |
verbose |
Print progress information (default: FALSE) |
Value
List containing final estimate M_trans and intermediate results
Examples
# Generate Poisson data
set.seed(2025)
# Source data (100 x 100 with 10% missing)
n1 <- 100; p1 <- 100; r <- 2
F_source <- matrix(runif(n1 * r, min = -2, max = 2), n1, r)
B_source <- matrix(runif(p1 * r, min = -2, max = 2), p1, r)
M_source <- F_source %*% t(B_source)
lambda_source <- exp(M_source)
X_source <- matrix(rpois(n1 * p1, as.vector(lambda_source)), n1, p1)
# Add 10% missing values to source
n_missing <- floor(n1 * p1 * 0.1)
missing_idx <- sample(n1 * p1, n_missing)
X_source[missing_idx] <- NA
# Target data (50 x 50, complete)
n0 <- 50; p0 <- 50
M_target_true <- M_source[1:n0, 1:p0]
lambda_target <- exp(M_target_true)
X_target <- matrix(rpois(n0 * p0, as.vector(lambda_target)), n0, p0)
# Run transGFM
result <- transGFM(
source_data = X_source,
target_data = X_target,
r = 2,
data_type = "count",
lambda_seq = seq(0, 5, by = 1),
K_cv = 3,
verbose = FALSE
)
# Check results
print(paste("Optimal lambda:", result$optimal_lambda))
print(paste("Final relative error:",
relative_error(result$M_trans, M_target_true)))
Multiple source transfer learning for generalized factor models
Description
Multiple source transfer learning for generalized factor models
Usage
transGFM_multi(
source_data_list,
target_data,
r,
data_type = "count",
method = "AD",
lambda_seq = seq(0, 10, by = 1),
K_cv = 3,
sigma2 = 1,
max_iter_cjmle = 30,
max_iter_refine = 30,
max_iter_nuclear = 100,
verbose = FALSE
)
Arguments
source_data_list |
List of source data matrices (may contain missing values) |
target_data |
Target data matrix (complete) |
r |
Number of factors |
data_type |
Type of data: "continuous", "count", or "binary" |
method |
Fusion method: "AD" (Average-Debias) or "DA" (Debias-Average) |
lambda_seq |
Sequence of lambda values for CV |
K_cv |
Number of CV folds |
sigma2 |
Variance parameter for continuous data |
max_iter_cjmle |
Maximum iterations for CJMLE |
max_iter_refine |
Maximum iterations for refinement |
max_iter_nuclear |
Maximum iterations for nuclear MLE |
verbose |
Print progress information |
Value
List containing final estimate and intermediate results
Examples
# Generate Poisson data
set.seed(2025)
# Generate 3 source datasets (100 x 100 with different missing rates)
n1 <- 100; p1 <- 100; r <- 2
source_list <- list()
F_s <- matrix(runif(n1 * r, min = -2, max = 2), n1, r)
B_s <- matrix(runif(p1 * r, min = -2, max = 2), p1, r)
M_s <- F_s %*% t(B_s)
for (s in 1:3) {
X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1)
# Add missing values (10%, 12%, 14% for sources 1-3)
missing_rate <- 0.1 + (s - 1) * 0.02
n_missing <- floor(n1 * p1 * missing_rate)
missing_idx <- sample(n1 * p1, n_missing)
X_s[missing_idx] <- NA
source_list[[s]] <- X_s
}
# Target data (50 x 50, complete)
n0 <- 50; p0 <- 50
M_target_true <- M_s[1:n0, 1:p0]
X_target <- matrix(rpois(n0 * p0, exp(M_target_true)), n0, p0)
# Run transGFM_multi with AD method
result_AD <- transGFM_multi(
source_data_list = source_list,
target_data = X_target,
r = 2,
data_type = "count",
method = "AD",
lambda_seq = seq(0, 5, by = 1),
K_cv = 3,
verbose = FALSE
)
# Run transGFM_multi with DA method
result_DA <- transGFM_multi(
source_data_list = source_list,
target_data = X_target,
r = 2,
data_type = "count",
method = "DA",
verbose = FALSE
)
# Compare results
print(paste("AD method error:", relative_error(result_AD$M_trans, M_target_true)))
print(paste("DA method error:", relative_error(result_DA$M_trans, M_target_true)))