R - 用前一个非NA值填充其后的NA值
问题描述:
我有一个名为input的数据帧,看起来像下面这样:
# Example data frame (referred to as `input` in the question), written in the
# structure(list(...), class = "data.frame") form: 20 rows with the columns
# sequence, score, epitope (factor), positioning and accessions.
# `accessions` is non-NA only on rows 1 and 11; all other rows hold NA.
structure(list(sequence = c("LdBPK_010012800.1", "MAQNDKIAPQDQDSF",
"AQNDKIAPQDQDSFL", "QNDKIAPQDQDSFLD", "NDKIAPQDQDSFLDD", "DKIAPQDQDSFLDDQ",
"KIAPQDQDSFLDDQP", "IAPQDQDSFLDDQPG", "APQDQDSFLDDQPGV", "PQDQDSFLDDQPGVR",
"LdBPK_020009000.1", "MAQNDKIAPQDQDSF", "AQNDKIAPQDQDSFL", "QNDKIAPQDQDSFLD",
"NDKIAPQDQDSFLDD", "DKIAPQDQDSFLDDQ", "KIAPQDQDSFLDDQP", "IAPQDQDSFLDDQPG",
"APQDQDSFLDDQPGV", "PQDQDSFLDDQPGVR"), score = c(1, 17007, 12388,
15984, 23405, 31897, 26826, 35239, 35361, 36486, 1, 17007, 12388,
15984, 23405, 31897, 26826, 35239, 35361, 36486), epitope = structure(c(1L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("", "Epitope", "Non-Epitope"), class = "factor"),
positioning = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE), accessions = c("LdBPK_010012800.1",
NA, NA, NA, NA, NA, NA, NA, NA, NA, "LdBPK_020009000.1",
NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -20L
), .Names = c("sequence", "score", "epitope", "positioning",
"accessions"), class = "data.frame")
(其实我的原始数据帧有超过100万行,所以这只是它的一小部分)
我想在input$accessions列中,把非NA值(从LdBPK_010012800.1开始)向下填充到其后的NA位置,直到遇到下一个非NA值(在本示例中为LdBPK_020009000.1)。然后再用LdBPK_020009000.1填充它下面的NA值,直到遇到下一个非NA值,依此类推。
此操作后,我的新的数据帧应该是这样的:
# Desired result: the same 20-row data frame, with the sequence, score,
# epitope and positioning columns as in the input example. `accessions`
# contains no NA any more: each row carries the accession of the nearest
# non-NA row at or above it (rows 1-10 the first accession, rows 11-20 the
# second).
structure(list(sequence = c("LdBPK_010012800.1", "MAQNDKIAPQDQDSF",
"AQNDKIAPQDQDSFL", "QNDKIAPQDQDSFLD", "NDKIAPQDQDSFLDD", "DKIAPQDQDSFLDDQ",
"KIAPQDQDSFLDDQP", "IAPQDQDSFLDDQPG", "APQDQDSFLDDQPGV", "PQDQDSFLDDQPGVR",
"LdBPK_020009000.1", "MAQNDKIAPQDQDSF", "AQNDKIAPQDQDSFL", "QNDKIAPQDQDSFLD",
"NDKIAPQDQDSFLDD", "DKIAPQDQDSFLDDQ", "KIAPQDQDSFLDDQP", "IAPQDQDSFLDDQPG",
"APQDQDSFLDDQPGV", "PQDQDSFLDDQPGVR"), score = c(1, 17007, 12388,
15984, 23405, 31897, 26826, 35239, 35361, 36486, 1, 17007, 12388,
15984, 23405, 31897, 26826, 35239, 35361, 36486), epitope = structure(c(1L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("", "Epitope", "Non-Epitope"), class = "factor"),
positioning = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE, FALSE, FALSE, FALSE), accessions = c("LdBPK_010012800.1",
"LdBPK_010012800.1", "LdBPK_010012800.1", "LdBPK_010012800.1",
"LdBPK_010012800.1", "LdBPK_010012800.1", "LdBPK_010012800.1",
"LdBPK_010012800.1", "LdBPK_010012800.1", "LdBPK_010012800.1",
"LdBPK_020009000.1", "LdBPK_020009000.1", "LdBPK_020009000.1",
"LdBPK_020009000.1", "LdBPK_020009000.1", "LdBPK_020009000.1",
"LdBPK_020009000.1", "LdBPK_020009000.1", "LdBPK_020009000.1",
"LdBPK_020009000.1")), row.names = c(NA, -20L), .Names = c("sequence",
"score", "epitope", "positioning", "accessions"), class = "data.frame")
我这样做,是因为我的最终目标是使用dplyr按accessions和score进行分组。
答
我们可以使用tidyr包(随tidyverse一起加载)中的fill函数:
library(tidyverse)

# `accessions` is set only on the first row of each record and is NA on the
# rows beneath it. tidyr::fill() replaces each NA with the most recent non-NA
# value above it; "down" is the default direction and is spelled out here so
# the intent is explicit.
# The data frame is called `input` in the question, so pipe from that name,
# and keep the result in a variable instead of printing every row (the real
# data has over a million rows).
input_filled <- input %>%
  fill(accessions, .direction = "down")