Data Science

Read a tab-separated file (.txt) into R

The key argument to specify correctly is sep: the tab character is written in R as sep = "\t". The backslash is the escape character (转义符); it means that the character following \ has a special meaning. See also `?Quotes` for the full list of escape sequences.

Two rows are treated as duplicates when they contain the same elements with the same frequencies, regardless of the order in which the elements appear. How can we remove such duplicate rows?

# Example data: rows 1/3 and rows 2/4 contain the same two values in swapped order.
df <- tibble(
  x = c(0, 18, 4, 9, 88),
  y = c(4, 9, 0, 18, 40)
)
## # A tibble: 5 × 2
##       x     y
##   <dbl> <dbl>
## 1     0     4
## 2    18     9
## 3     4     0
## 4     9    18
## 5    88    40
# Build an order-insensitive key per row (the sorted pair of x and y),
# then keep only the first row for each distinct key.
df %>%
  mutate(z = map2(x, y, \(a, b) sort(c(a, b)))) %>%
  distinct(z, .keep_all = TRUE)
## # A tibble: 3 × 3
##       x     y z        
##   <dbl> <dbl> <list>   
## 1     0     4 <dbl [2]>
## 2    18     9 <dbl [2]>
## 3    88    40 <dbl [2]>

1 分组操作


# Tag every row with the integer id of the group it belongs to.
tibble(
  col = c("A", "A", "B", "B", "C")
) %>%
  group_by(col) %>%
  mutate(grp = cur_group_id())
## # A tibble: 5 × 2
## # Groups:   col [3]
##   col     grp
##   <chr> <int>
## 1 A         1
## 2 A         1
## 3 B         2
## 4 B         2
## 5 C         3


# Is a comment rendered in bold?
df <- read_csv("F:/Learning_materials/R/正则/Demo_t.test.csv")
## # A tibble: 38 × 7
##    compoundID case_1 case_2 case_3 control_1 control_2 control_3
##    <chr>       <dbl>  <dbl>  <dbl>     <dbl>     <dbl>     <dbl>
##  1 com_001       485    154    268       350       432       425
##  2 com_002       208    372    219       457       324       392
##  3 com_003       219    125    345       473       480       403
##  4 com_004       289    356    116       489       376       500
##  5 com_005       248    456    279       457       426       436
##  6 com_006       323    142    462       451       354       452
##  7 com_007       259    148    374       397       346       383
##  8 com_008       428    262    226       436       499       308
##  9 com_009       327    494    244       316       368       401
## 10 com_010       480    343    495       383       471       387
## # ℹ 28 more rows
# Reshape the wide table (one column per replicate) into long form in two steps:
#   1. names_to = ".value" with pattern "(.*)_" keeps the part of each column
#      name before the underscore (case / control) as the new column names.
#   2. Those columns are then stacked into a `trt` label column and a `val`
#      value column.
# Column 1 (compoundID) is excluded from both pivots.
df= df %>% 
  pivot_longer(-1, names_pattern = "(.*)_", names_to = c(".value")) %>% 
  pivot_longer(-1, names_to = "trt", values_to = "val")
## # A tibble: 228 × 3
##    compoundID trt       val
##    <chr>      <chr>   <dbl>
##  1 com_001    case      485
##  2 com_001    control   350
##  3 com_001    case      154
##  4 com_001    control   432
##  5 com_001    case      268
##  6 com_001    control   425
##  7 com_002    case      208
##  8 com_002    control   457
##  9 com_002    case      372
## 10 com_002    control   324
## # ℹ 218 more rows
# Run t_test(val ~ trt) separately within each compoundID group;
# the printed result has one row per compound.
df %>% 
  group_by(compoundID) %>% 
  t_test(val ~ trt, detailed = TRUE)
## # A tibble: 38 × 16
##    compoundID estimate estimate1 estimate2 .y.   group1 group2     n1    n2
##  * <chr>         <dbl>     <dbl>     <dbl> <chr> <chr>  <chr>   <int> <int>
##  1 com_001     -100         302.      402. val   case   control     3     3
##  2 com_002     -125.        266.      391  val   case   control     3     3
##  3 com_003     -222.        230.      452  val   case   control     3     3
##  4 com_004     -201.        254.      455  val   case   control     3     3
##  5 com_005     -112         328.      440. val   case   control     3     3
##  6 com_006     -110         309       419  val   case   control     3     3
##  7 com_007     -115         260.      375. val   case   control     3     3
##  8 com_008     -109         305.      414. val   case   control     3     3
##  9 com_009       -6.67      355       362. val   case   control     3     3
## 10 com_010       25.7       439.      414. val   case   control     3     3
## # ℹ 28 more rows
## # ℹ 7 more variables: statistic <dbl>, p <dbl>, df <dbl>, conf.low <dbl>,
## #   conf.high <dbl>, method <chr>, alternative <chr>


group_modify() returns a grouped tibble. In that case .f must return a data frame.

  1. 计算每组中所有变量的最小值
  2. 并将结果分别添加在各组的最后面
# Rows 2-10 of mtcars, restricted to its first four columns.
df <- mtcars[2:10, c("mpg", "cyl", "disp", "hp")]
##                    mpg cyl  disp  hp
## Mazda RX4 Wag     21.0   6 160.0 110
## Datsun 710        22.8   4 108.0  93
## Hornet 4 Drive    21.4   6 258.0 110
## Hornet Sportabout 18.7   8 360.0 175
## Valiant           18.1   6 225.0 105
## Duster 360        14.3   8 360.0 245
## Merc 240D         24.4   4 146.7  62
## Merc 230          22.8   4 140.8  95
## Merc 280          19.2   6 167.6 123
unique(df[["cyl"]]) # cyl has 3 distinct values
## [1] 6 4 8
# Minimum of every non-grouping column within each cyl group.
# `.cols` is spelled out as everything(): relying on across()'s default
# column selection is deprecated in dplyr >= 1.1.0.
df %>%
  group_by(cyl) %>%
  summarise(across(everything(), min))
## # A tibble: 3 × 4
##     cyl   mpg  disp    hp
##   <dbl> <dbl> <dbl> <dbl>
## 1     4  22.8   108    62
## 2     6  18.1   160   105
## 3     8  14.3   360   175
# For each cyl group, append one extra row holding the column-wise minimum
# of that group's data (.x is the group's rows without the grouping column).
df %>%
  group_by(cyl) %>%
  group_modify(~ bind_rows(.x, apply(.x, 2, min)))
## # A tibble: 12 × 4
## # Groups:   cyl [3]
##      cyl   mpg  disp    hp
##    <dbl> <dbl> <dbl> <dbl>
##  1     4  22.8  108     93
##  2     4  24.4  147.    62
##  3     4  22.8  141.    95
##  4     4  22.8  108     62
##  5     6  21    160    110
##  6     6  21.4  258    110
##  7     6  18.1  225    105
##  8     6  19.2  168.   123
##  9     6  18.1  160    105
## 10     8  18.7  360    175
## 11     8  14.3  360    245
## 12     8  14.3  360    175




  • apply(.x, 2, min)计算每个变量的最小值,得到结果;
  • 再用bind_rows把(每组)数据本身和结果合并起来;


bind_rows() and bind_cols() return the same type as the first input.



# Preview the first three rows of the economics data.
head(economics, n = 3)
## # A tibble: 3 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
# Derive a year-month key from `date`, then average pce within each key.
economics %>%
  mutate(ym = tsibble::yearmonth(date)) %>%
  group_by(ym) %>%
  summarise(pce = mean(pce))
## # A tibble: 574 × 2
##           ym   pce
##        <mth> <dbl>
##  1  1967 7月  507.
##  2  1967 8月  510.
##  3  1967 9月  516.
##  4 1967 10月  512.
##  5 1967 11月  517.
##  6 1967 12月  525.
##  7  1968 1月  531.
##  8  1968 2月  534.
##  9  1968 3月  544.
## 10  1968 4月  544 
## # ℹ 564 more rows

2 同时操作多列



The parameters of the across() function are, in order, .cols, .fns and .... In general, arguments must be supplied in this order if we want to omit their names.

The first parameter, “.cols”, has a default value (i.e. everything() ). If we do not need to change “.cols”, we can omit it entirely.

However, the function that follows (~ str_replace_all() ) would then be the first argument supplied and would be mistaken for the “.cols” parameter. So we must name it explicitly: .fns = ~ str_replace_all(). 如果不加.fns=,函数会被误认为是.cols的参数,导致出错。

##    A    B
## 1 X. X..X
## 2 Y. Y..Y
# Replace every literal "." with "0" in all columns, using a formula lambda.
# `.fns` is named because `.cols` is omitted.
# NOTE(review): newer dplyr releases reportedly deprecate omitting `.cols` — confirm
# against the installed version.
df %>% 
  mutate(across(.fns= ~ str_replace_all(.x, "\\.", "0")))
##    A    B
## 1 X0 X00X
## 2 Y0 Y00Y
# Same replacement, but passing str_replace_all itself as `.fns` and handing
# its `pattern` / `replacement` arguments through across()'s `...`.
# NOTE(review): forwarding arguments through `...` is reportedly deprecated in
# newer dplyr releases — confirm against the installed version.
df %>% 
  mutate(across(.fns= str_replace_all, pattern= "\\.", replacement= "0"))
##    A    B
## 1 X0 X00X
## 2 Y0 Y00Y

3 操作行/筛选行

  1. Remove rows that contain repeated elements
# Example data: rows 2 and 3 each contain a value that occurs more than once.
df <- data.frame(
  A = 1:4,
  B = c(2, 3, 4, 3),
  C = c(10, 10, 4, 1),
  D = c(4, 2, 4, 6)
)
##   A B  C D
## 1 1 2 10 4
## 2 2 3 10 2
## 3 3 4  4 4
## 4 4 3  1 6
# Keep only the rows whose values are all different from one another.
df %>%
  filter(!pmap_lgl(., \(...) any(duplicated(c(...)))))
##   A B  C D
## 1 1 2 10 4
## 2 4 3  1 6
# Alternative: a row is kept when its number of distinct values equals its length.
df %>%
  filter(pmap_lgl(., \(...) {
    row_vals <- c(...)
    length(unique(row_vals)) == length(row_vals)
  }))
##   A B  C D
## 1 1 2 10 4
## 2 4 3  1 6
  1. Repeat each row N times

Each row is repeated N times: the first row appears twice, the second row three times, and the third row once.

# A holds the values; N holds how many times each row should appear.
df <- tibble(
  A = c(0.56, 4.33, 5.81),
  N = c(2, 3, 1)
)
## # A tibble: 3 × 2
##       A     N
##   <dbl> <dbl>
## 1  0.56     2
## 2  4.33     3
## 3  5.81     1
# Repeat row i exactly N[i] times.
# seq_len(n()) is used instead of 1:n() so a zero-row input yields an empty
# index rather than c(1, 0).
df %>%
  slice(rep(seq_len(n()), times = N)) # slice(1, 1, 2, 2, 2, 3)
## # A tibble: 6 × 2
##       A     N
##   <dbl> <dbl>
## 1  0.56     2
## 2  0.56     2
## 3  4.33     3
## 4  4.33     3
## 5  4.33     3
## 6  5.81     1
# Base-R equivalent; seq_len() keeps the index empty when df has no rows.
df[rep(seq_len(nrow(df)), times = df$N), ] # Basic syntax
## # A tibble: 6 × 2
##       A     N
##   <dbl> <dbl>
## 1  0.56     2
## 2  0.56     2
## 3  4.33     3
## 4  4.33     3
## 5  4.33     3
## 6  5.81     1
  1. Merge the elements of columns into one column, excluding NA.
## # A tibble: 4 × 4
##       A     B     C     D
##   <int> <dbl> <dbl> <dbl>
## 1     1     2    10     4
## 2     2    NA    10     2
## 3    NA     4     4     4
## 4     4    NA     1    NA
# Collapse the non-missing elements of `x` into a single "-"-separated string.
#
# x:       an atomic vector that may contain NA.
# returns: a length-one character vector; "" when every element is NA.
f <- function(x) {
  # Drop NA first so that missing values do not appear in the joined string.
  paste0(x[!is.na(x)], collapse = "-")
}
# Apply f() to each row (all columns gathered into one vector) and store the
# resulting string in a new column.
df %>%
  mutate(new = pmap_chr(., \(...) f(c(...))))
## # A tibble: 4 × 5
##       A     B     C     D new     
##   <int> <dbl> <dbl> <dbl> <chr>   
## 1     1     2    10     4 1-2-10-4
## 2     2    NA    10     2 2-10-2  
## 3    NA     4     4     4 4-4-4   
## 4     4    NA     1    NA 4-1
  1. Replace the last non-NA value of each row with NA
# Mixed-type example; row 2 is entirely NA and column C is NA throughout.
df <- tibble(
  A = c(200.79, NA, 193.2, NA),
  B = c(NA, NA, "C9LL", "WP45"),
  C = NA,
  D = c(4.326, NA, NA, NA)
)
## # A tibble: 4 × 4
##       A B     C         D
##   <dbl> <chr> <lgl> <dbl>
## 1  201. <NA>  NA     4.33
## 2   NA  <NA>  NA    NA   
## 3  193. C9LL  NA    NA   
## 4   NA  WP45  NA    NA
# Replace the last non-NA element of `x` with NA.
#
# x:       an atomic vector (names, if any, are preserved).
# returns: `x` with its right-most non-missing element set to NA; `x` is
#          returned unchanged when it has no non-missing element.
f <- function(x) {
  present <- which(!is.na(x))
  # Nothing to blank out when every element is already NA (or x is empty).
  if (length(present) > 0) {
    x[max(present)] <- NA
  }
  x
}
# Apply f() to every row and bind the per-row results back into one table.
pmap_dfr(df, \(...) f(c(...)))
## # A tibble: 4 × 4
##   A      B     C     D    
##   <chr>  <chr> <chr> <chr>
## 1 200.79 <NA>  <NA>  <NA> 
## 2 <NA>   <NA>  <NA>  <NA> 
## 3 193.2  <NA>  <NA>  <NA> 
## 4 <NA>   <NA>  <NA>  <NA>

4 操作列/筛选列

  1. Remove the columns in which every element is “AAA”
# x is "AAA" in every row; z is "AAA" only in its first three rows.
df <- tibble(
  x = rep("AAA", 5),
  y = 1:5,
  z = c(rep("AAA", 3), "b", "c")
)
## # A tibble: 5 × 3
##   x         y z    
##   <chr> <int> <chr>
## 1 AAA       1 AAA  
## 2 AAA       2 AAA  
## 3 AAA       3 AAA  
## 4 AAA       4 b    
## 5 AAA       5 c
# Drop the columns whose values are all "AAA" by negating the selection.
df %>%
  select(!where(~ all(.x == "AAA")))
## # A tibble: 5 × 2
##       y z    
##   <int> <chr>
## 1     1 AAA  
## 2     2 AAA  
## 3     3 AAA  
## 4     4 b    
## 5     5 c

.x == “AAA”是判断语句,判断每个列向量是否等于”AAA”,返回的结果是与列向量等长度的逻辑向量。比如z列的结果是T T T F F。


而我们想要的结果相反,是想筛除x列,保留y z列。所以我们用!符号,反向选择。

  1. Remove the column where all elements are NA
## # A tibble: 3 × 4
##   x         y z     w    
##   <lgl> <int> <lgl> <chr>
## 1 NA        1 NA    <NA> 
## 2 NA        2 NA    B    
## 3 NA        3 NA    C
# Keep only the columns that contain at least one non-NA value.
df %>%
  select(where(~ !all(is.na(.x))))
## # A tibble: 3 × 2
##       y w    
##   <int> <chr>
## 1     1 <NA> 
## 2     2 B    
## 3     3 C

5 正则表达式

  • 找出数值
  • 找出紧跟在b后的数值
  • 找出b后面出现的数值??
# Sample strings mixing letters and digits for the regex examples.
tt <- c(
  "ab1", "vf2", "aaba2",
  "dd9b76", "d8p", "a0b3e4"
)
# Every run of digits in each string (one list element per input string).
str_extract_all(string = tt, pattern = "\\d+")
## [[1]]
## [1] "1"
## [[2]]
## [1] "2"
## [[3]]
## [1] "2"
## [[4]]
## [1] "9"  "76"
## [[5]]
## [1] "8"
## [[6]]
## [1] "0" "3" "4"
# First run of digits that is immediately preceded by "b" (look-behind); NA when none.
str_extract(string = tt, pattern = "(?<=b)\\d+")
## [1] "1"  NA   NA   "76" NA   "3"

5.1 za




# Comma-separated numeric strings; each cell holds a variable number of values.
TEST <- tibble(
  a_AD = c("1,2", "0,2,3", "2,0", "0,0,2,3"),
  b_AD = c("1,2", "0,0,2,3", "0,2,0,3", "2,0")
)
## # A tibble: 4 × 2
##   a_AD    b_AD   
##   <chr>   <chr>  
## 1 1,2     1,2    
## 2 0,2,3   0,0,2,3
## 3 2,0     0,2,0,3
## 4 0,0,2,3 2,0
# For both columns: split each cell on ",", convert the pieces to numbers,
# and sum everything except the first piece.
TEST %>%
  mutate(across(
    1:2,
    \(col) map_dbl(str_split(col, ","), \(parts) sum(as.numeric(parts)[-1]))
  ))
## # A tibble: 4 × 2
##    a_AD  b_AD
##   <dbl> <dbl>
## 1     2     2
## 2     5     5
## 3     0     5
## 4     5     0



# Parse dates written as "year/month--day" into a Date column.
df <- tibble(
  x = as.Date(c("2005/1--20", "2018/9--3"), format = "%Y/%m--%d")
)
## # A tibble: 2 × 1
##   x         
##   <date>    
## 1 2005-01-20
## 2 2018-09-03
# Copy x into y when it is on or before 2005-01-20; otherwise y is a missing Date.
df %>%
  mutate(
    y = if_else(
      condition = x <= as.Date("2005-1-20"),
      true = x,
      false = NA_Date_
    )
  )
## # A tibble: 2 × 2
##   x          y         
##   <date>     <date>    
## 1 2005-01-20 2005-01-20
## 2 2018-09-03 NA




# Interactive use only: opens rstatix's as_tidy_cor (reached with `:::`, so
# presumably not exported) for editing through trace(edit = TRUE).
trace(rstatix:::as_tidy_cor, edit = TRUE)