R:不同 Dataframe 之间的通信

jv4diomz  于 2023-03-27  发布在  其他
关注(0)|答案(2)|浏览(109)

在R中,我有两个包含随机坐标的 Dataframe :

# Load the geosphere library
library(geosphere)

# Fix the RNG seed so the simulated coordinates are reproducible
set.seed(123)

# Number of points to simulate for each data frame
n1 <- 20
n2 <- 30

# Centre of the simulated point cloud (longitude/latitude of New York)
lon_mean <- -74.0060
lat_mean <- 40.7128

# Standard deviation of the simulated longitudes and latitudes
lon_sd <- 0.1
lat_sd <- 0.1

# Draw `n` normally distributed (lon, lat) pairs as a two-column data frame.
# All longitudes are sampled before the latitudes, so the random number
# stream is consumed in a fixed order.
simulate_points <- function(n, lon_mean, lat_mean, lon_sd, lat_sd) {
  lon <- rnorm(n, mean = lon_mean, sd = lon_sd)
  lat <- rnorm(n, mean = lat_mean, sd = lat_sd)
  data.frame(lon = lon, lat = lat)
}

# df_1 is sampled first, then df_2
df_1 <- simulate_points(n1, lon_mean, lat_mean, lon_sd, lat_sd)
df_2 <- simulate_points(n2, lon_mean, lat_mean, lon_sd, lat_sd)

# Keep only the distinct coordinate pairs of df_1
df_1_unique <- unique(df_1)

**我的问题:**对于df_1中的每个唯一坐标,我想计算该坐标与df_2中所有坐标之间的地理距离。

我试着用一个循环程序来实现:

# Compute the Haversine distance from every row of df_1 to every row of df_2.
# After the loop, result[[i]] is a numeric vector holding one distance per
# row of df_2 (in df_2 row order) for the i-th coordinate of df_1.
# With distHaversine()'s default radius the distances are in metres.

# All df_2 coordinates as a two-column (lon, lat) matrix, built once so that
# distHaversine() can handle every df_2 point in a single vectorised call
# instead of an inner loop over j.
df_2_points <- as.matrix(df_2[, c("lon", "lat")])

# Preallocate one list slot per row of df_1
result <- vector("list", nrow(df_1))

# seq_len() yields an empty sequence when df_1 has no rows (1:nrow() would
# iterate over c(1, 0) instead)
for (i in seq_len(nrow(df_1))) {
  point_i <- c(df_1[i, "lon"], df_1[i, "lat"])

  # Guard the empty case: there is nothing to measure against if df_2 is empty
  result[[i]] <- if (nrow(df_2_points) > 0) {
    distHaversine(point_i, df_2_points)
  } else {
    numeric(0)
  }

  print(result[[i]])
}

#####

# Reshape the list of distance vectors into one long data frame.
# For each element i of `result` build a data frame with
#   id        - position of the distance within result[[i]]
#   values    - the distance itself
#   iteration - the index i of the list element
# seq_along() keeps this safe when `result` is empty, and rep() keeps the
# column lengths consistent when an individual distance vector is empty.
end <- lapply(seq_along(result), function(i) {
  data.frame(
    id = seq_along(result[[i]]),
    values = result[[i]],
    iteration = rep(i, length(result[[i]]))
  )
})

# Stack the per-iteration data frames on top of each other
final <- do.call(rbind, end)

现在,对于这个最终结果,我尝试添加4列:

  • long_1
  • lat_1
  • long_2
  • lat_2

也就是说,当 final$iteration == i 且 final$id == j 时,long_1 = df_1[i, 1],lat_1 = df_1[i, 2];并且 long_2 = df_2[j, 1],lat_2 = df_2[j, 2]。
我试着用下面的代码来完成这个任务:

# Attach the coordinates behind every distance to `final`.
# final$iteration is the row index into df_1 and final$id the row index into
# df_2, so indexing with the whole columns fills all rows in one step; no
# loop over the individual iteration values is needed. Columns are selected
# by name rather than by position so a reordered data frame cannot silently
# swap longitude and latitude.
final$long_1 <- df_1$lon[final$iteration]
final$lat_1 <- df_1$lat[final$iteration]
final$long_2 <- df_2$lon[final$id]
final$lat_2 <- df_2$lat[final$id]

有人能告诉我这样做是否正确吗?

谢谢!

cqoc49vn

cqoc49vn1#

与其执行多个循环,不如使用 crossing 将两个数据集展开为所有行的两两组合,然后通过 rowwise 逐行应用 distHaversine

library(dplyr)
library(tidyr)
library(geosphere)
library(stringr)
# Build every combination of a df_1 row with a df_2 row and compute the
# Haversine distance for each pair.
# - The df_2 columns are renamed with a "_2" suffix so they do not clash with
#   the df_1 columns.
# - crossing() de-duplicates and sorts its inputs, so the rows come out in
#   sorted order rather than in the original row order.
# - distHaversine() accepts two-column (lon, lat) matrices, so all pairs are
#   handled in one vectorised call; no rowwise()/ungroup() pass is needed.
result2 <- crossing(
  df_1,
  df_2 %>% rename_with(~ str_c(.x, "_2"))
) %>%
  mutate(distances = distHaversine(cbind(lon, lat), cbind(lon_2, lat_2)))
  • 输出
result2
# A tibble: 600 × 5
     lon   lat lon_2 lat_2 distances
   <dbl> <dbl> <dbl> <dbl>     <dbl>
 1 -74.2  40.7 -74.2  40.8    13372.
 2 -74.2  40.7 -74.1  40.8    13275.
 3 -74.2  40.7 -74.1  40.8    14040.
 4 -74.2  40.7 -74.1  40.8    17541.
 5 -74.2  40.7 -74.1  40.7    10186.
 6 -74.2  40.7 -74.1  40.7    11751.
 7 -74.2  40.7 -74.1  40.8    14096.
 8 -74.2  40.7 -74.1  40.6    18080.
 9 -74.2  40.7 -74.0  40.7    13429.
10 -74.2  40.7 -74.0  40.7    14181.
# … with 590 more rows

如果我们使用嵌套的for循环,则可以在单个循环中完成

# Build one row per (df_1 row, df_2 row) pair together with its Haversine
# distance. Rows are ordered df_1-major: all df_2 rows for the first df_1
# row, then all df_2 rows for the second df_1 row, and so on.
# The pairs are generated with index vectors rather than by rbind()-ing onto
# a data frame inside a double loop, which copies the whole result on every
# iteration.
idx_1 <- rep(seq_len(nrow(df_1)), each = nrow(df_2))
idx_2 <- rep(seq_len(nrow(df_2)), times = nrow(df_1))

result <- data.frame(
  lon_1 = df_1$lon[idx_1],
  lat_1 = df_1$lat[idx_1],
  lon_2 = df_2$lon[idx_2],
  lat_2 = df_2$lat[idx_2]
)

# distHaversine() is vectorised over two-column (lon, lat) matrices; skip the
# call entirely when there are no pairs to measure.
result$distance <- if (nrow(result) > 0) {
  distHaversine(
    cbind(result$lon_1, result$lat_1),
    cbind(result$lon_2, result$lat_2)
  )
} else {
  numeric(0)
}
  • 输出
> dim(result)
[1] 600   5
> result2 %>% 
  filter(round(lat, 2) == 40.61, round(lon, 2) == -74.06, 
      round(lon_2, 2) == -74.08)
# A tibble: 1 × 5
    lon   lat lon_2 lat_2 distances
  <dbl> <dbl> <dbl> <dbl>     <dbl>
1 -74.1  40.6 -74.1  40.7     6520.
> head(result, 1)
      lon_1    lat_1     lon_2   lat_2 distance
1 -74.06205 40.60602 -74.07547 40.6637 6520.188
wwwo4jvm

wwwo4jvm2#

也许只是这样做:

library(dplyr)
# Pair each unique df_1 coordinate with every df_2 coordinate and compute the
# Haversine distance of each pair.
# cross_join() disambiguates the clashing column names with the suffixes
# ".x" (df_1_unique) and ".y" (df_2).
# distHaversine() accepts two-column (lon, lat) matrices, so all distances are
# computed in one vectorised call; this avoids a rowwise() pass, which would
# also leave the result row-grouped for every later operation.
# as_tibble() keeps the compact tibble printing.
cross_join(df_1_unique, df_2) %>%
  as_tibble() %>%
  mutate(dist = distHaversine(cbind(lon.x, lat.x), cbind(lon.y, lat.y)))

输出:

lon.x lat.x lon.y lat.y   dist
   <dbl> <dbl> <dbl> <dbl>  <dbl>
 1 -74.1  40.6 -74.1  40.7  6520.
 2 -74.1  40.6 -74.0  40.5 14137.
 3 -74.1  40.6 -74.1  40.8 23837.
 4 -74.1  40.6 -73.8  40.6 23404.
 5 -74.1  40.6 -73.9  40.6 15528.
 6 -74.1  40.6 -74.1  40.8 23782.
 7 -74.1  40.6 -74.0  40.7  8818.
 8 -74.1  40.6 -74.1  40.6  1878.
 9 -74.1  40.6 -73.9  40.7 17929.
10 -74.1  40.6 -74.0  40.7 11098.
# … with 590 more rows
# ℹ Use `print(n = ...)` to see more rows

相关问题