我想训练一个模型,但面临一个问题:在每一批中,我会选择一些节点进行训练,节点的id范围从0到999。在训练时,我发现代码使用节点的id作为索引,并试图获取节点的其他数据,因此索引超出范围。我的代码在这里
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import train_test_split_edges
from torch_geometric.data import DataLoader as loader
import pandas as pd
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
# feature
class GCN(nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels):
super(GCN, self).__init__()
self.conv1 = GCNConv(in_channels, hidden_channels)
self.conv2 = GCNConv(hidden_channels, out_channels)
def forward(self, x, edge_index):
x = self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, p=0.5, training=self.training)
x = self.conv2(x, edge_index)
return x
class MyDataset(Dataset):
def __init__(self, nodes_file, edges_file, communities_file, features_file):
self.nodes_data = pd.read_csv(nodes_file)
self.edges_data = pd.read_csv(edges_file)
self.communities_data = pd.read_csv(communities_file)
self.features_data = pd.read_csv(features_file)
def __len__(self):
return len(self.nodes_data)
def __getitem__(self, idx):
node_id = self.nodes_data.iloc[idx]['ID']
node_category = self.nodes_data.iloc[idx]['Category']
node_community = self.communities_data[self.communities_data['ID'] == node_id]['Community'].values[0]
edge_source = self.nodes_data.iloc[idx]['ID']
edge_targets = eval(self.edges_data.iloc[idx]['neighbor'])
# Get the corresponding features for the current node_id
node_features = self.features_data[self.features_data['NodeID'] == node_id].iloc[:, 1:].values
node_features = torch.tensor(node_features, dtype=torch.float)
# You can process the node, edge, and community data as per your requirements
# and return them as tensors
return node_id, node_category, node_community, edge_source, edge_targets, node_features
def custom_collate_fn(batch):
node_ids, node_categories, node_communities, edge_sources, edge_targets, node_features = zip(*batch)
# Convert to PyTorch tensors
node_ids = torch.tensor(node_ids, dtype=torch.long)
node_categories = torch.tensor(node_categories, dtype=torch.long)
node_communities = torch.tensor(node_communities, dtype=torch.long)
edge_targets = [torch.tensor(targets) for targets in edge_targets]
# Replicate edge_sources to have the same length as edge_targets
edge_sources_replicated = [torch.tensor([source] * len(targets), dtype=torch.long) for source, targets in zip(edge_sources, edge_targets)]
return node_ids, node_categories, node_communities, edge_sources_replicated, edge_targets, node_features
nodes_file = 'nodes.csv'
edges_file = 'RWR_features.csv'
communities_file = 'community.csv'
feature_file = 'node_features.csv'
dataset = MyDataset(nodes_file, edges_file, communities_file, feature_file)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=custom_collate_fn)
input_dim = 1000 # Replace with the actual input dimension
hidden_dim = 64
output_dim = 32
num_categories = 3 # Replace with the actual number of node categories
num_communities = 5 # Replace with the actual number of communities
num_epochs = 300
# GCN
gcn_model = GCN(input_dim, hidden_dim, output_dim)
# classify
...
# train loop
for epoch in range(num_epochs):
for batch_data in dataloader:
node_ids, node_categories, node_communities, edge_sources_list, edge_targets_list, node_feature = batch_data
# print('node_ids:', node_ids)
# print('len:', len(node_ids))
# print('node_categories:', node_categories)
# print('len:', len(node_categories))
# print('node_communities:', node_communities)
# print('len:', len(node_communities))
# print('edge_sources:', edge_sources)
# print('len:', len(edge_sources))
# print('edge_targets', edge_targets)
# print('len:', len(edge_targets))
node_feature = torch.cat(node_feature, dim=0)
# print('node_feature', node_feature)
# print('len:', len(node_feature))
edge_sources = torch.cat(edge_sources_list, dim=0)
edge_targets = torch.cat(edge_targets_list, dim=0)
# print(edge_sources)
# print(len(edge_sources))
# print(edge_targets)
# print(len(edge_targets))
edge_index = torch.stack([edge_sources, edge_targets], dim=0)
# print(edge_index)
x = gcn_model(node_feature, edge_index)
字符串
我该如何解决此问题?一开始我想通过节点的ID批量找到他的索引,然后用这个索引找到我需要的其余数据,但是不知道怎么实现,这种方法可行吗?谢谢你,我很好
错误信息如下:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[2], line 62
57 edge_index = torch.stack([edge_sources, edge_targets], dim=0)
59 # print(edge_index)
---> 62 x = gcn_model(node_feature, edge_index)
64 # A
65 classify_logits = classify_model(x)
File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
Cell In[1], line 21, in GCN.forward(self, x, edge_index)
20 def forward(self, x, edge_index):
---> 21 x = self.conv1(x, edge_index)
22 x = F.relu(x)
23 x = F.dropout(x, p=0.5, training=self.training)
File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/gcn_conv.py:210, in GCNConv.forward(self, x, edge_index, edge_weight)
208 cache = self._cached_edge_index
209 if cache is None:
--> 210 edge_index, edge_weight = gcn_norm( # yapf: disable
211 edge_index, edge_weight, x.size(self.node_dim),
212 self.improved, self.add_self_loops, self.flow, x.dtype)
213 if self.cached:
214 self._cached_edge_index = (edge_index, edge_weight)
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/nn/conv/gcn_conv.py:100, in gcn_norm(edge_index, edge_weight, num_nodes, improved, add_self_loops, flow, dtype)
98 row, col = edge_index[0], edge_index[1]
99 idx = col if flow == 'source_to_target' else row
--> 100 deg = scatter(edge_weight, idx, dim=0, dim_size=num_nodes, reduce='sum')
101 deg_inv_sqrt = deg.pow_(-0.5)
102 deg_inv_sqrt.masked_fill_(deg_inv_sqrt == float('inf'), 0)
File ~/miniconda3/lib/python3.8/site-packages/torch_geometric/utils/scatter.py:74, in scatter(src, index, dim, dim_size, reduce)
72 if reduce == 'sum' or reduce == 'add':
73 index = broadcast(index, src, dim)
---> 74 return src.new_zeros(size).scatter_add_(dim, index, src)
76 if reduce == 'mean':
77 count = src.new_zeros(dim_size)
RuntimeError: index 332 is out of bounds for dimension 0 with size 64
型
数据示例如下:
节点.csv(总计:1000个节点):enter image description here
RWR_features.csv(每个节点有30个重要的邻居,从边缘文件中采样,类是源节点的类):enter image description here的
community.csv:enter image description here的
node_features.csv:enter image description here的
07/30/2023编辑
已经修好了。问题出在GCN的输入上,我重写了代码。
import pandas as pd
import numpy as np
def load_data():
# Load nodes.csv
nodes_df = pd.read_csv('nodes.csv')
# Load community.csv
community_df = pd.read_csv('community.csv')
# Load node_features.csv
node_features_df = pd.read_csv('node_features.csv')
# Load RWR_features.csv
rwr_features_df = pd.read_csv('RWR_features.csv', usecols=['node', 'neighbor'])
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(eval)
rwr_features_df['neighbor'] = rwr_features_df['neighbor'].apply(lambda x: [int(i) for i in x])
# Create mapping from node ID to index
node_to_index = {node_id: index for index, node_id in enumerate(nodes_df['ID'])}
# Create adjacency matrix
num_nodes = len(nodes_df)
adj = np.zeros((num_nodes, num_nodes))
for _, row in rwr_features_df.iterrows():
node_id = row['node']
neighbor_ids = row['neighbor']
for neighbor_id in neighbor_ids:
adj[node_to_index[node_id]][node_to_index[neighbor_id]] = 1
adj[node_to_index[neighbor_id]][node_to_index[node_id]] = 1 # Assuming undirected graph
# Create feature matrix
feature_columns = node_features_df.columns[1:] # Exclude the first column (ID)
feature = node_features_df[feature_columns].values
# Create class labels
class_label = nodes_df['Category'].values
# Create community labels
com_label = community_df['Community'].values
# Split indices for train, val, and test sets (You can modify this split ratio as needed)
num_train = int(num_nodes * 0.6)
num_val = int(num_nodes * 0.2)
num_test = num_nodes - num_train - num_val
# Generate random indices for train, val, and test sets
indices = np.random.permutation(num_nodes)
train_idx = indices[:num_train]
val_idx = indices[num_train:num_train + num_val]
test_idx = indices[num_train + num_val:]
return adj, feature, class_label, com_label, train_idx, val_idx, test_idx
# Usage
adj, feature, class_label, com_label, train_idx, val_idx, test_idx = load_data()
class MyDataset(Dataset):
def __init__(self, adj, feature, class_label, com_label):
self.adj = adj
self.feature = feature
self.class_label = class_label
self.com_label = com_label
def __len__(self):
return len(self.class_label)
def __getitem__(self, idx):
return idx, self.adj[idx] , self.feature[idx], self.class_label[idx], self.com_label[idx]
型
然后,我将邻接矩阵和特征矩阵输入到GCN中,而不是edge_index。
gcn_feature = gcn_model(batch_adj, batch_feature)
型
1条答案
按热度按时间dnph8jn41#
已经修好了。感谢所有读者的时间。问题在于GCN的输入,我需要将邻接矩阵和特征矩阵输入到GCN中,而不是edge_index。首先我重写了load_data部分:
字符串
需要返回idx来切割特征矩阵。然后,输入数据。
型