我想在Kaggle提供的GPU上运行我的代码。虽然我可以在CPU上运行我的代码,但我想无法将其正确迁移到Kaggle GPU上运行。
在运行此
with tf.device("/device:GPU:0"):
hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
而得到这个错误
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-28-cdb8b009cd85> in <module>
1 with tf.device("/device:GPU:0"):
----> 2 hist = model.fit(x=X_train, y=Y_train, validation_data=(X_test, Y_test), batch_size=25, epochs=20, callbacks=callbacks_list)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
817 self._assert_compile_was_called()
818 self._check_call_args('evaluate')
--> 819
820 func = self._select_training_loop(x)
821 return func.evaluate(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
233
234 recreate_training_iterator = (
--> 235 training_data_adapter.should_recreate_iterator(steps_per_epoch))
236 if not steps_per_epoch:
237 # TODO(b/139762795): Add step inference for when steps is None to
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_training_inputs(model, x, y, batch_size, epochs, sample_weights, class_weights, steps_per_epoch, validation_split, validation_data, validation_steps, shuffle, distribution_strategy, max_queue_size, workers, use_multiprocessing)
591 class_weights=None,
592 shuffle=False,
--> 593 steps=None,
594 distribution_strategy=None,
595 max_queue_size=10,
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2.py in _process_inputs(model, mode, x, y, batch_size, epochs, sample_weights, class_weights, shuffle, steps, distribution_strategy, max_queue_size, workers, use_multiprocessing)
704 """Provide a scope for running one batch."""
705 batch_logs = {'batch': step, 'size': size}
--> 706 self.callbacks._call_batch_hook(
707 mode, 'begin', step, batch_logs)
708 self.progbar.on_batch_begin(step, batch_logs)
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
355 sample_weights = _process_numpy_inputs(sample_weights)
356
--> 357 # If sample_weights are not specified for an output use 1.0 as weights.
358 if (sample_weights is not None and
359 any([sw is None for sw in sample_weights])):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/data_adapter.py in slice_inputs(self, indices_dataset, inputs)
381 if steps and not batch_size:
382 batch_size = int(math.ceil(num_samples/steps))
--> 383
384 if not batch_size:
385 raise ValueError(
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in from_tensors(tensors)
564 existing iterators.
565
--> 566 Args:
567 unused_dummy: Ignored value.
568
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/ops/dataset_ops.py in __init__(self, element)
2763 init_args: A nested structure representing the arguments to `init_func`.
2764 init_func: A TensorFlow function that will be called on `init_args` each
-> 2765 time a C++ iterator over this dataset is constructed. Returns a nested
2766 structure representing the "state" of the dataset.
2767 next_func: A TensorFlow function that will be called on the result of
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/data/util/structure.py in normalize_element(element)
111 ops.convert_to_tensor(t, name="component_%d" % i))
112 return nest.pack_sequence_as(element, normalized_components)
--> 113
114
115 def convert_legacy_structure(output_types, output_shapes, output_classes):
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py in convert_to_tensor(value, dtype, name, as_ref, preferred_dtype, dtype_hint, ctx, accepted_result_types)
1312 return ret
1313 raise TypeError("%sCannot convert %r with type %s to Tensor: "
-> 1314 "no conversion function registered." %
1315 (_error_prefix(name), value, type(value)))
1316
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/tensor_conversion_registry.py in _default_conversion_function(***failed resolving arguments***)
50 def _default_conversion_function(value, dtype, name, as_ref):
51 del as_ref # Unused.
---> 52 return constant_op.constant(value, dtype, name=name)
53
54
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in constant(value, dtype, shape, name)
256 return _eager_fill(shape.as_list(), t, ctx)
257 raise TypeError("Eager execution of tf.constant with unsupported shape "
--> 258 "(value has %d elements, shape is %s with %d elements)." %
259 (num_t, shape, shape.num_elements()))
260 g = ops.get_default_graph()
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in _constant_impl(value, dtype, shape, name, verify_shape, allow_broadcast)
264 value, dtype=dtype, shape=shape, verify_shape=verify_shape,
265 allow_broadcast=allow_broadcast))
--> 266 dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
267 const_tensor = g.create_op(
268 "Const", [], [dtype_value.type],
/opt/conda/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py in convert_to_eager_tensor(value, ctx, dtype)
94 dtype = dtypes.as_dtype(dtype).as_datatype_enum
95 ctx.ensure_initialized()
---> 96 return ops.EagerTensor(value, ctx.device_name, dtype)
97
98
RuntimeError: Can't copy Tensor with type string to device /job:localhost/replica:0/task:0/device:GPU:0.
我也尝试过安装不同的tensorflow版本,如最新的tensorflow,tensorflow-gpu,tensorflow-gpu=1.12,但没有成功。
虽然我可以通过使用以下代码列出CPU和GPUfrom tensorflow.python.client import device_lib print(device_lib.list_local_devices())
救命啊!
2条答案
按热度按时间s3fp2yjn1#
我终于让它工作了。在tensorflow中有一些未知的bug。它在tf-nightly build中工作正常。
wz3gfoph2#
当我像这样运行模型时,它对我来说工作得很好。