Hi, I have set up Grafana Tempo with OpenTelemetry in my EKS cluster. Initially Tempo shows traces, but after 1 hour no traces appear in Grafana and it returns 404 Not Found.
This is my Tempo configuration:
apiVersion: v1
data:
overrides.yaml: |
overrides:
ingestion_rate_limit_bytes: 400000
max_bytes_per_trace: 0
max_search_bytes_per_trace: 100000000
tempo.yaml: |
auth_enabled: false
compactor:
compaction:
compacted_block_retention: 24h
compaction_window: 1h
block_retention: 1h
distributor:
receivers:
jaeger:
protocols:
thrift_compact:
endpoint: 0.0.0.0:6831
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:55680
ingester:
lifecycler:
ring:
replication_factor: 1
trace_idle_period: 1s
max_block_duration: 10m
server:
http_listen_port: 3200
storage:
trace:
backend: local
block:
bloom_filter_false_positive: .05
blocklist_poll: 30s
local:
path: /tmp/tempo/traces
wal:
path: /var/tempo/wal
pool:
max_workers: 1000
queue_depth: 200000
overrides:
ingestion_rate_limit_bytes: 400000
max_bytes_per_trace: 0
max_search_bytes_per_trace: 100000000
kind: ConfigMap
metadata:
name: tempo
namespace: monitoring
---
apiVersion: v1
kind: Service
metadata:
labels:
name: tempo
name: tempo
namespace: monitoring
spec:
ports:
- name: tempo-prom-metrics
port: 3200
targetPort: 3200
- name: tempo-otlp
port: 55680
protocol: TCP
targetPort: 55680
- name: http
port: 80
targetPort: 3200
- name: receiver
port: 6831
protocol: UDP
targetPort: 6831
selector:
app: tempo
name: tempo
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: tempo
namespace: monitoring
spec:
minReadySeconds: 10
replicas: 1
revisionHistoryLimit: 10
selector:
matchLabels:
app: tempo
name: tempo
template:
metadata:
annotations:
config_hash: 7f4b5fad0e6364b6a2a5ea380281cb0e
labels:
app: tempo
name: tempo
spec:
containers:
- args:
- -config.file=/conf/tempo.yaml
- -mem-ballast-size-mbs=1024
env:
- name: JAEGER_AGENT_PORT
value: ""
image: grafana/tempo:main-e6394c3
imagePullPolicy: IfNotPresent
name: tempo
ports:
- containerPort: 3200
name: prom-metrics
- containerPort: 55680
name: otlp
protocol: TCP
volumeMounts:
- mountPath: /conf
name: tempo-conf
volumes:
- configMap:
name: tempo
name: tempo-conf
OTEL configuration:
apiVersion: v1
kind: ConfigMap
metadata:
name: collector-config
namespace: prod-aro-eks-clone
labels:
app: opentelemetry
component: otel-collector-conf
data:
collector.yaml: |
receivers:
# Make sure to add the otlp receiver.
# This will open up the receiver on port 4317
otlp:
protocols:
grpc:
endpoint: "0.0.0.0:5555"
http:
hostmetrics:
collection_interval: 20s
scrapers:
cpu:
metrics:
system.cpu.utilization:
enabled: true
load:
memory:
metrics:
system.memory.utilization:
enabled: true
disk:
filesystem:
metrics:
system.filesystem.utilization:
enabled: true
network:
paging:
metrics:
system.paging.utilization:
enabled: true
processes:
process:
k8s_cluster:
collection_interval: 10s
node_conditions_to_report: [Ready, MemoryPressure,DiskPressure,NetworkUnavailable]
allocatable_types_to_report: [cpu, memory,storage]
k8s_events:
auth_type : serviceAccount
receiver_creator:
watch_observers: [k8s_observer]
receivers:
kubeletstats:
rule: type == "k8s.node"
config:
collection_interval: 10s
auth_type: serviceAccount
endpoint: "`endpoint`:`kubelet_endpoint_port`"
insecure_skip_verify: true
extra_metadata_labels:
- container.id
- k8s.volume.type
metric_groups:
- node
- pod
- volume
- container
prometheus:
config:
scrape_configs:
- job_name: 'kube-state-metrics'
scrape_interval: 5s
scrape_timeout: 1s
static_configs:
- targets: ['kube-prometheus-stack-kube-state-metrics.monitoring.svc.cluster.local:8080']
- job_name: k8s
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: ['__meta_kubernetes_pod_annotation_prometheus_io_scrape']
regex: "true"
action: keep
metric_relabel_configs:
- source_labels: [__name__]
regex: "(request_duration_seconds.*|response_duration_seconds.*)"
action: keep
processors:
memory_limiter:
check_interval: 1s
limit_mib: 2000
spike_limit_mib: 500
batch:
timeout: 10s
send_batch_size: 10000
spanmetrics:
metrics_exporter: prometheus
latency_histogram_buckets: [100ms, 250ms,500ms,1s,2s,4s,6s,8s,10s,20s,30s]
dimensions:
- name: http.method
- name: http.status_code
- name: db.operation
- name: db.statement
- name: exception.message
- name: exception.type
- name: messaging.message.id
- name: messaging.message.payload_size_bytes
dimensions_cache_size: 10000
aggregation_temporality: "AGGREGATION_TEMPORALITY_CUMULATIVE"
servicegraph:
metrics_exporter: prometheus
transform:
metric_statements:
- context: metric
statements:
- set(description, "Measures the duration of inbound HTTP requests") where name == "http.server.duration"
cumulativetodelta:
include:
metrics:
- system.network.io
- system.disk.operations
- system.network.dropped
- system.network.packets
- process.cpu.time
match_type: strict
resource:
attributes:
- key: host.id
from_attribute: host.name
action: upsert
resourcedetection:
detectors: [env, system]
k8sattributes:
auth_type: serviceAccount
passthrough: false
filter:
node_from_env_var: K8S_NODE_NAME
extract:
metadata:
- k8s.pod.name
- k8s.pod.uid
- k8s.deployment.name
- k8s.namespace.name
- k8s.node.name
- k8s.pod.start_time
metricstransform:
transforms:
include: .+
match_type: regexp
action: update
operations:
- action: add_label
new_label: kubernetes.cluster.id
new_value: CLUSTER_ID_TO_REPLACE
- action: add_label
new_label: kubernetes.name
new_value: prod-aro
extensions:
health_check: {}
exporters:
otlp:
endpoint: "http://tempo.monitoring.svc.cluster.local:55680"
tls:
insecure: true
prometheus:
endpoint: "0.0.0.0:6666"
logging:
loglevel: info
loki:
endpoint: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
labels:
resource:
container.name: "container_name"
k8s.cluster.name: "k8s_cluster_name"
k8s.event.reason: "k8s_event_reason"
k8s.object.kind: "k8s_object_kind"
k8s.object.name: "k8s_object_name"
k8s.object.uid: "k8s_object_uid"
k8s.object.fieldpath: "k8s_object_fieldpath"
k8s.object.api_version: "k8s_object_api_version"
attributes:
k8s.event.reason: "k8s_event_reason"
k8s.event.action: "k8s_event_action"
k8s.event.start_time: "k8s_event_start_time"
k8s.event.name: "k8s_event_name"
k8s.event.uid: "k8s_event_uid"
k8s.namespace.name: "k8s_namespace_name"
k8s.event.count: "k8s_event_count"
record:
traceID: "traceid"
service:
extensions: [health_check]
pipelines:
logs:
receivers: [k8s_events]
processors: [memory_limiter,k8sattributes,batch]
exporters: [loki,logging]
traces:
receivers: [otlp]
processors: [spanmetrics,servicegraph,batch]
exporters: [otlp]
metrics:
receivers: [otlp,prometheus]
processors: [memory_limiter,metricstransform,k8sattributes,resourcedetection,batch]
exporters: [logging,prometheus,otlp]
telemetry:
logs:
level: info
initial_fields:
service: my-prom-instance
I want to highlight one point: after the OTEL pod / Tempo is restarted, traces show up in Grafana again. It looks as if the collector stops collecting traces after a while, and only starts picking them up again once the OTEL service is restarted.
I want Tempo to keep traces for a long time.
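One way to narrow this down is to query Tempo's HTTP API directly, bypassing Grafana. This is a minimal sketch, assuming port-forward access to the tempo service in the monitoring namespace; <TRACE_ID> is a placeholder for a trace ID older than one hour:

# Forward Tempo's HTTP port (the service maps port 3200 to the container)
kubectl -n monitoring port-forward svc/tempo 3200:3200

# A 404 for old trace IDs while fresh ones return 200 means the traces were
# ingested and then removed by Tempo, rather than never collected.
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:3200/api/traces/<TRACE_ID>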
1 Answer
You need to update the block_retention setting. In your tempo.yaml you currently have block_retention: 1h, which is why you cannot query traces older than 1 hour. From the Grafana Tempo documentation:

Optional. Duration to keep blocks. Default is 14 days (336h).
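A minimal sketch of the corrected compactor block, assuming you want the documented 14-day default (use whatever retention your storage backend can actually hold):

compactor:
  compaction:
    compacted_block_retention: 24h
    compaction_window: 1h
    block_retention: 336h   # was 1h; blocks and their traces are deleted once older than this

After updating the ConfigMap, restart the Tempo pod so it reloads the file, for example with kubectl -n monitoring rollout restart statefulset tempo.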