如何使用for循环创建一个变量过滤的数据集列表,然后在每个数据集中搜索重复项

wkftcu5l  于 2023-03-27  发布在  其他
关注(0)|答案(4)|浏览(99)
structure(list(finess = c("450010079", "450000245", "450000245", 
"570005165", "300788502", "300788502", "940300270", "290004142", 
"350005179", "350005179", "130786049", "020004404", "020004404", 
"020004404", "020004404", "630781839", "570005165", "570005165", 
"350005179", "350005179", "130786049", "130786049", "630781839", 
"570000646", "570000646", "770300010", "500000146", "710780644", 
"310780382", "310780382"), ghm2 = c("06K02Z", "02C05J", "02C05J", 
"10M092", "06M20T", "16M111", "13K02Z", "06K04J", "04M133", "04M15T", 
"10C111", "06K02Z", "01C15J", "05M122", "06M121", "02C02J", "10C131", 
"23M20Z", "06C071", "04M22Z", "17K091", "10C111", "02C05J", "02C05J", 
"02C05J", "90Z00Z", "06K02Z", "06M032", "05M15T", "05K102"), 
    dp = c("Z8000", "H251", "H251", "E6686", "D128", "D508", 
    "N840", "K293", "J952", "J9868", "C73", "K635", "G560", "I712", 
    "R101", "H358", "E6686", "R798", "Z432", "Z098", "Z5101", 
    "C73", "H251", "H258", "H258", "RSSABS", "D125", "K291", 
    "I10", "I209"), dr = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "J961+1", "C73", 
    NA, NA, NA, NA, "XXXX", NA, NA, NA, NA), sexe = c(1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L), modeEntree = c("8", 
    "8", "8", "8", "8", "8", "8", "8", "7", "8", "8", "8", "8", 
    "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", 
    "x", "8", "8", "8", "8"), annee = c(2018L, 2018L, 2018L, 
    2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 
    2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2020L, 
    2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L
    )), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
))

我想创建一个函数来查看数据库中是否每年都有重复的数据

mylist<-list()
for(i in 2018:2020){
  temp<-mydata%>%filter(annee==i)
  mylist[[i]]<-table(duplicated(temp))
}

我想拥有这样的东西

[[2018]]
FALSE  TRUE 
    9     1 

[[2019]]
FALSE  
   10     

[[2020]]
FALSE  TRUE 
    9     1

但我得到了一个奇怪的输出

[[668]]
NULL

[[669]]
NULL

[[670]]
NULL

[[671]]
NULL

[[672]]
NULL

[[673]]
NULL

[[674]]
NULL

[[675]]
NULL

[[676]]
NULL

[[677]]
NULL

[[678]]
NULL

[[679]]
NULL

[[680]]
NULL

[[681]]
NULL

[[682]]
NULL

[[683]]
NULL

[[684]]
NULL

[[685]]
NULL

[[686]]
NULL

[[687]]
NULL

[[688]]
NULL

[[689]]
NULL

[[690]]
NULL

[[691]]
NULL

[[692]]
NULL

[[693]]
NULL

[[694]]
NULL

[[695]]
NULL

[[696]]
NULL

[[697]]
NULL

[[698]]
NULL

[[699]]
NULL

[[700]]
NULL

[[701]]
NULL

[[702]]
NULL

[[703]]
NULL

[[704]]
NULL

[[705]]
NULL

[[706]]
NULL

[[707]]
NULL

[[708]]
NULL

[[709]]
NULL

[[710]]
NULL

[[711]]
NULL

[[712]]
NULL

[[713]]
NULL

[[714]]
NULL

[[715]]
NULL

[[716]]
NULL

[[717]]
NULL

[[718]]
NULL

[[719]]
NULL

[[720]]
NULL

[[721]]
NULL

[[722]]
NULL

[[723]]
NULL

[[724]]
NULL

[[725]]
NULL

[[726]]
NULL

[[727]]
NULL

[[728]]
NULL

[[729]]
NULL

[[730]]
NULL

[[731]]
NULL

[[732]]
NULL

[[733]]
NULL

[[734]]
NULL

[[735]]
NULL

[[736]]
NULL

[[737]]
NULL

[[738]]
NULL

[[739]]
NULL

[[740]]
NULL

[[741]]
NULL

[[742]]
NULL

[[743]]
NULL

[[744]]
NULL

[[745]]
NULL

[[746]]
NULL

[[747]]
NULL

[[748]]
NULL

[[749]]
NULL

[[750]]
NULL

[[751]]
NULL

[[752]]
NULL

[[753]]
NULL

[[754]]
NULL

[[755]]
NULL

[[756]]
NULL

[[757]]
NULL

[[758]]
NULL

[[759]]
NULL

[[760]]
NULL

[[761]]
NULL

[[762]]
NULL

[[763]]
NULL

[[764]]
NULL

[[765]]
NULL

[[766]]
NULL

[[767]]
NULL

[[768]]
NULL

[[769]]
NULL

[[770]]
NULL

[[771]]
NULL

[[772]]
NULL

[[773]]
NULL

[[774]]
NULL

[[775]]
NULL

[[776]]
NULL

[[777]]
NULL

[[778]]
NULL

[[779]]
NULL

[[780]]
NULL

[[781]]
NULL

[[782]]
NULL

[[783]]
NULL

[[784]]
NULL

[[785]]
NULL

[[786]]
NULL

[[787]]
NULL

[[788]]
NULL

[[789]]
NULL

[[790]]
NULL

[[791]]
NULL

[[792]]
NULL

[[793]]
NULL

[[794]]
NULL

[[795]]
NULL

[[796]]
NULL

[[797]]
NULL

[[798]]
NULL

[[799]]
NULL

[[800]]
NULL

[[801]]
NULL

[[802]]
NULL

[[803]]
NULL

[[804]]
NULL

[[805]]
NULL

[[806]]
NULL

[[807]]
NULL

[[808]]
NULL

[[809]]
NULL

[[810]]
NULL

[[811]]
NULL

[[812]]
NULL

[[813]]
NULL

[[814]]
NULL

[[815]]
NULL

[[816]]
NULL

[[817]]
NULL

[[818]]
NULL

[[819]]
NULL

[[820]]
NULL

[[821]]
NULL

[[822]]
NULL

[[823]]
NULL

[[824]]
NULL

[[825]]
NULL

[[826]]
NULL

[[827]]
NULL

[[828]]
NULL

[[829]]
NULL

[[830]]
NULL

[[831]]
NULL

[[832]]
NULL

[[833]]
NULL

[[834]]
NULL

[[835]]
NULL

[[836]]
NULL

[[837]]
NULL

[[838]]
NULL

[[839]]
NULL

[[840]]
NULL

[[841]]
NULL

[[842]]
NULL

[[843]]
NULL

[[844]]
NULL

[[845]]
NULL

[[846]]
NULL

[[847]]
NULL

[[848]]
NULL

[[849]]
NULL

[[850]]
NULL

[[851]]
NULL

[[852]]
NULL

[[853]]
NULL

[[854]]
NULL

[[855]]
NULL

[[856]]
NULL

[[857]]
NULL

[[858]]
NULL

[[859]]
NULL

[[860]]
NULL

[[861]]
NULL

[[862]]
NULL

[[863]]
NULL

[[864]]
NULL

[[865]]
NULL

[[866]]
NULL

[[867]]
NULL

[[868]]
NULL

[[869]]
NULL

[[870]]
NULL

[[871]]
NULL

[[872]]
NULL

[[873]]
NULL

[[874]]
NULL

[[875]]
NULL

[[876]]
NULL

[[877]]
NULL

[[878]]
NULL

[[879]]
NULL

[[880]]
NULL

[[881]]
NULL

[[882]]
NULL

[[883]]
NULL

[[884]]
NULL

[[885]]
NULL

[[886]]
NULL

[[887]]
NULL

[[888]]
NULL

[[889]]
NULL

[[890]]
NULL

[[891]]
NULL
nhhxz33t

nhhxz33t1#

如果你想在列表中找到重复的行,可以使用group_split()hablar::find_duplicates

library(dplyr)                                                                                                                                                                   

df %>% 
  group_split(annee) %>% 
  purrr::map(hablar::find_duplicates)

输出

[[1]]
# A tibble: 2 × 7
  finess    ghm2   dp    dr     sexe modeEntree annee
  <chr>     <chr>  <chr> <chr> <int> <chr>      <int>
1 450000245 02C05J H251  NA        1 8           2018
2 450000245 02C05J H251  NA        1 8           2018

[[2]]
# A tibble: 0 × 7
# … with 7 variables: finess <chr>, ghm2 <chr>, dp <chr>, dr <chr>, sexe <int>,
#   modeEntree <chr>, annee <int>
# ℹ Use `colnames()` to see all variable names

[[3]]
# A tibble: 2 × 7
  finess    ghm2   dp    dr     sexe modeEntree annee
  <chr>     <chr>  <chr> <chr> <int> <chr>      <int>
1 570000646 02C05J H258  NA        2 8           2020
2 570000646 02C05J H258  NA        2 8           2020
cxfofazt

cxfofazt2#

你的代码确实可以工作,但是当你设置一个列表的第2018个元素时,R会将之前的所有2017个元素都创建为NULL。所以打印整个列表会显示很多NULL元素。
你可以只打印你想要的:

> mylist[2018:2020]
[[1]]

FALSE  TRUE 
    9     1 

[[2]]

FALSE 
   10 

[[3]]

FALSE  TRUE 
    9     1

或者你可以使用一个字符来索引它们:

mylist<-list()
for(i in 2018:2020){
  temp<-mydata%>%filter(annee==i)
  mylist[[as.character(i)]]<-table(duplicated(temp))
}
mylist
$`2018`

FALSE  TRUE 
    9     1 

$`2019`

FALSE 
   10 

$`2020`

FALSE  TRUE 
    9     1

最后,我不知道你为什么要这样做,要将每组重复行的数量制成表格,这样做可能更容易:

by(mydata , mydata$annee , \(d) table(duplicated(d)))

mydata$annee: 2018

FALSE  TRUE 
    9     1 
---------------------------------------------------------------------- 
mydata$annee: 2019

FALSE 
   10 
---------------------------------------------------------------------- 
mydata$annee: 2020

FALSE  TRUE 
    9     1

或者如果你想使用data.table,你可以做一个交叉表:

setDT(mydata)
dcast(mydata[ , .(duplicated=duplicated(.SD)), by=annee], duplicated~annee, length)

   duplicated 2018 2019 2020
1:      FALSE    9   10    9
2:       TRUE    1    0    1
i2byvkas

i2byvkas3#

你也可以使用我的函数fduplicates(),它使用了tidyverse数据屏蔽规则,就像dplyr::distinct()和friends一样。
它还可以优雅地与许多组一起工作,因为它在幕后使用data.table。

# remotes::install_github("NicChr/timeplyr")
library(timeplyr)
library(dplyr)
df %>%
  group_by(annee) %>%
  fduplicates(.both_ways = TRUE, # Duplicates + first non-duplicate
              .add_count = TRUE) %>% # Duplicate count
  group_split()
#> [[1]]
#> # A tibble: 2 x 8
#>   annee finess    ghm2   dp    dr     sexe modeEntree     n
#>   <int> <chr>     <chr>  <chr> <chr> <int> <chr>      <int>
#> 1  2018 450000245 02C05J H251  <NA>      1 8              2
#> 2  2018 450000245 02C05J H251  <NA>      1 8              2
#> 
#> [[2]]
#> # A tibble: 2 x 8
#>   annee finess    ghm2   dp    dr     sexe modeEntree     n
#>   <int> <chr>     <chr>  <chr> <chr> <int> <chr>      <int>
#> 1  2020 570000646 02C05J H258  <NA>      2 8              2
#> 2  2020 570000646 02C05J H258  <NA>      2 8              2

# Alernatively..
df %>%
  group_by(annee) %>%
  summarise(n = n(), 
            n_dupes = nrow(fduplicates(pick(everything()), 
                                       .both_ways = TRUE)),
            n_non_dupes = n - n_dupes)
#> # A tibble: 3 x 4
#>   annee     n n_dupes n_non_dupes
#>   <int> <int>   <int>       <int>
#> 1  2018    10       2           8
#> 2  2019    10       0          10
#> 3  2020    10       2           8

创建于2023-03-25带有reprex v2.0.2

8ehkhllq

8ehkhllq4#

你得到了正确的结果,但是你将元素位置2018赋值给2020。为了使你的方法更可读,在赋值时减去2017

library(dplyr)

mylist<-list()
for(i in 2018:2020){
  temp <- mydata%>%filter(annee==i)
  mylist[[i - 2017]] <- table(duplicated(temp))
}

mylist
#> [[1]]
#> 
#> FALSE  TRUE 
#>     9     1 
#> 
#> [[2]]
#> 
#> FALSE 
#>    10 
#> 
#> [[3]]
#> 
#> FALSE  TRUE 
#>     9     1

因为你使用的是'dplyr',我们也可以使用dplyr::group_map()来得到相同的结果:

mydata %>% 
  group_by(annee) %>% 
  group_map(~ duplicated(.x) %>% 
              table())
#> [[1]]
#> .
#> FALSE  TRUE 
#>     9     1 
#> 
#> [[2]]
#> .
#> FALSE 
#>    10 
#> 
#> [[3]]
#> .
#> FALSE  TRUE 
#>     9     1

创建于2023-03-22带有reprex v2.0.2
数据来自OP

mydata <- structure(list(finess = c("450010079", "450000245", "450000245", 
                          "570005165", "300788502", "300788502", "940300270", "290004142", 
                          "350005179", "350005179", "130786049", "020004404", "020004404", 
                          "020004404", "020004404", "630781839", "570005165", "570005165", 
                          "350005179", "350005179", "130786049", "130786049", "630781839", 
                          "570000646", "570000646", "770300010", "500000146", "710780644", 
                          "310780382", "310780382"), ghm2 = c("06K02Z", "02C05J", "02C05J", 
                                                              "10M092", "06M20T", "16M111", "13K02Z", "06K04J", "04M133", "04M15T", 
                                                              "10C111", "06K02Z", "01C15J", "05M122", "06M121", "02C02J", "10C131", 
                                                              "23M20Z", "06C071", "04M22Z", "17K091", "10C111", "02C05J", "02C05J", 
                                                              "02C05J", "90Z00Z", "06K02Z", "06M032", "05M15T", "05K102"), 
               dp = c("Z8000", "H251", "H251", "E6686", "D128", "D508", 
                      "N840", "K293", "J952", "J9868", "C73", "K635", "G560", "I712", 
                      "R101", "H358", "E6686", "R798", "Z432", "Z098", "Z5101", 
                      "C73", "H251", "H258", "H258", "RSSABS", "D125", "K291", 
                      "I10", "I209"), dr = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                             NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "J961+1", "C73", 
                                             NA, NA, NA, NA, "XXXX", NA, NA, NA, NA), sexe = c(1L, 1L, 
                                                                                               1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 
                                                                                               1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L), modeEntree = c("8", 
                                                                                                                                                                   "8", "8", "8", "8", "8", "8", "8", "7", "8", "8", "8", "8", 
                                                                                                                                                                   "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", "8", 
                                                                                                                                                                   "x", "8", "8", "8", "8"), annee = c(2018L, 2018L, 2018L, 
                                                                                                                                                                                                       2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 
                                                                                                                                                                                                       2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2020L, 
                                                                                                                                                                                                       2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L
                                                                                                                                                                   )), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
                                                                                                                                                                   ))

创建于2023-03-22带有reprex v2.0.2

相关问题