llama_index [问题]：LlamaIndex 流式响应（streaming）评估

rur96b6h 于 4个月前发布在其他

关注(0)|答案(4)|浏览(49)

问题验证

我已经在文档和Discord上搜索了答案。

问题

当我在 LlamaIndex 的 query_engine 中设置 streaming=True（代码如下）时，输出列为空，并且记录 DataFrame 中缺少 groundedness 列（使用 TruLens 进行评估）。
不设置 streaming=True 时，它运行正常。

import logging
import pickle
import sys
import os
import warnings
import trulens_eval
from dotenv import load_dotenv
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage, PromptTemplate
from llama_index.retrievers.bm25 import BM25Retriever
from custom_retriever import HybridRetriever
from utils import create_template
from trulens_eval.feedback.provider import OpenAI
from trulens_eval import Feedback, Tru
import numpy as np
from trulens_eval.app import App
from trulens_eval import TruLlama
from llama_index.llms.openai import OpenAI

# Send log records of level INFO and above to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)

# Ignore FutureWarnings whose message starts with the text below
# (the `message` argument is matched against the start of the warning text).
warnings.filterwarnings('ignore', category=FutureWarning, message="The default dtype for empty Series will be 'object'")
# Directory checked further down to decide between building and reloading the index.
PERSIST_DIR = "./storage"
# Must run before os.getenv below -- presumably loads variables from a .env file.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
tru = Tru()
# `OpenAI` is imported twice at the top of the file (first from
# trulens_eval.feedback.provider, later from llama_index.llms.openai); the later
# import rebinds the name, so this line constructs the LlamaIndex LLM.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.4)
# Because of that shadowing, the TruLens provider is reached via its fully
# qualified module path.
# NOTE(review): the API key is passed positionally -- confirm the provider
# constructor accepts it that way.
provider = trulens_eval.feedback.provider.OpenAI(openai_api_key)

# settings = {
#     "model": "gpt-3.5-turbo",
#     "temperature": 0.7,
#     "max_tokens": 500,
#     "top_p": 1,
#     "frequency_penalty": 0,
#     "presence_penalty": 0,
# }


# Read the pre-computed nodes from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only use a doc.pickle
# that comes from a trusted source.
with open("doc.pickle", "rb") as f:
    nodes = pickle.load(f)

if os.path.exists(PERSIST_DIR):
    # A persisted store is already on disk: reload the index from it.
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)
else:
    # First run: register the nodes in a fresh docstore and build the index.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    index = VectorStoreIndex(nodes, storage_context=storage_context)
    # Persist the freshly built index so later runs take the branch above.
    index.storage_context.persist(persist_dir=PERSIST_DIR)

# Embedding-based retriever returning the 4 most similar nodes.
vector_retriever = index.as_retriever(similarity_top_k=4)

# BM25 retriever over the same nodes, also limited to 4 results.
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=4,
)

# Project-local retriever wrapping both of the retrievers above.
hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

# Query engine over the hybrid retriever, with streaming responses enabled.
query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    llm=llm,
    streaming=True,
)

# Replace the text-QA prompt of the response synthesizer with the project template.
query_engine.update_prompts(
    {'response_synthesizer:text_qa_template': create_template()},
)

# response = query_engine.query("What is jas?")
# print(response)
#
# Selector obtained from the query engine; used as the "context" argument of
# the feedback functions below.
context = App.select_context(query_engine)

# Groundedness feedback: evaluated on the collected context (all chunks as one
# list) together with the app output.
f_groundedness = Feedback(
    provider.groundedness_measure_with_cot_reasons
).on(context.collect()).on_output()

# Answer relevance feedback: evaluated on the overall input and output.
f_answer_relevance = Feedback(provider.relevance).on_input_output()

# Context relevance feedback: evaluated on the input and the context selector,
# with the resulting scores averaged by np.mean.
f_context_relevance = Feedback(
    provider.context_relevance_with_cot_reasons
).on_input().on(context).aggregate(np.mean)

# Wrap the query engine in a TruLlama recorder carrying the three feedbacks.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

# Run a single query inside the recorder context.
# NOTE(review): the engine was created with streaming=True, so `response` is
# presumably a streaming object whose tokens are never consumed in this script;
# this is likely related to the empty output column -- confirm against the
# TruLens version's support for streaming responses.
with tru_query_engine_recorder as recording:
    response = query_engine.query("jas date")

# Records exposed by the recording object (not used further in this script).
rec = recording.records
# Fetch the stored records and feedback results for this app id.
records, feedback = tru.get_records_and_feedback(app_ids=["LlamaIndex_App1"])
# The return values of the next two calls are discarded here; they are only
# displayed when the code runs in an interactive session or notebook.
tru.get_leaderboard(app_ids=["LlamaIndex_App1"])

records.head()

tru.run_dashboard()

llama_index

来源：https://github.com/run-llama/llama_index/issues/14508

4条答案

按热度按时间

x33g5p2x1#

要解决您在使用 streaming=True 功能时遇到的问题以及记录 DataFrame 中缺少的列，请参考以下步骤和注意事项：

确保流式传输的正确配置:
确保 RetrieverQueryEngine 已正确配置以处理流式传输响应。在构建 Response Synthesizer 时需要设置 streaming=True ,然后将其传递给 RetrieverQueryEngine 。

# Import both pieces from llama_index.core so that legacy and core objects are
# not mixed in the same query engine.
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine

# Streaming is configured on the response synthesizer ...
synth = get_response_synthesizer(llm=llm, streaming=True)
# ... which is then passed to the engine together with the retriever
# (`llm` and `hybrid_retriever` are the objects built in the question's script).
query_engine = RetrieverQueryEngine(
    retriever=hybrid_retriever,
    response_synthesizer=synth,
)

处理记录 DataFrame 中缺少的列:
TruLlama 类使用 get_eval_results_df 函数在记录 DataFrame 中聚合和显示反馈。如果缺少像 "groundedness" 这样的列，请确保反馈函数已正确定义并应用。

from trulens_eval import Feedback, TruLlama

# Feedback functions: groundedness (collected context + output), answer
# relevance (input + output) and context relevance (input + context, averaged).
f_groundedness = (
    Feedback(provider.groundedness_measure_with_cot_reasons)
    .on(context.collect())
    .on_output()
)
f_answer_relevance = (
    Feedback(provider.relevance)
    .on_input_output()
)
f_context_relevance = (
    Feedback(provider.context_relevance_with_cot_reasons)
    .on_input()
    .on(context)
    .aggregate(np.mean)
)

# Create the TruLlama recorder and attach the three feedback functions.
tru_query_engine_recorder = TruLlama(
    query_engine,
    app_id='LlamaIndex_App1',
    feedbacks=[f_groundedness, f_answer_relevance, f_context_relevance],
)

关于缺少列的已知问题:
在 get_retrieval_results_df 函数中存在关于缺少列的已知问题。如果结果 DataFrame 中不存在指定的度量键，则函数会引发 ValueError 。请确保 DataFrame 中包含所有必要的度量键。

def get_retrieval_results_df(names, results_arr, metric_keys=None):
    """Build a DataFrame of per-retriever mean metric values.

    Args:
        names: Labels placed in the "retrievers" column.
        results_arr: One iterable of evaluation results per entry in `names`;
            each result must expose a `metric_vals_dict` mapping.
        metric_keys: Metric names to average. Falls back to
            DEFAULT_METRIC_KEYS when omitted or empty.

    Returns:
        A DataFrame with a "retrievers" column followed by one column per
        metric key holding the mean of that metric for each retriever.

    Raises:
        ValueError: If a requested metric key is absent from the results of
            any retriever.
    """
    keys = metric_keys or DEFAULT_METRIC_KEYS
    averages = defaultdict(list)

    # Only the results are needed inside the loop; names are paired with them
    # purely to stop at the shorter of the two sequences, as zip does.
    for _, eval_results in zip(names, results_arr):
        rows = []
        for result in eval_results:
            rows.append(result.metric_vals_dict)
        frame = pd.DataFrame(rows)

        for key in keys:
            if key not in frame.columns:
                raise ValueError(f"Metric key {key} not in results_df")
            averages[key].append(frame[key].mean())

    table = {"retrievers": names}
    table.update(averages)
    return pd.DataFrame(table)

通过确保流式传输的正确配置并解决关于缺少列的已知问题，您应该能够解决输出列为空以及记录 DataFrame 中缺少 groundedness 列的问题。参考：[1](https://github.com/run-llama/llama_index/blob/main/docs/docs/module_guides/deploying/query_engine/streaming.md) [2](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/evaluation/notebook_utils.py)

要继续与 Dosu 对话，请提及 @dosu。*

赞(0）回复(0）举报 4个月前

bihw5rsg2#

我检查了所有方面，仍然面临相同的问题。如果 streaming 为 True，仪表板（dashboard）的输出始终为空。

赞(0）回复(0）举报 4个月前

jtoj6r0c3#

To resolve the issue with the output being None when streaming=True in the query_engine , you need to ensure that the StreamingResponse object is properly handled and processed. The StreamingResponse object requires you to iterate over the tokens as they arrive, which might not be directly compatible with how TruLens expects the data.
Here is an example of how you can handle the StreamingResponse to ensure that the output is properly captured and processed:

# Assuming `index` has already been built: create a query engine with streaming enabled
query_engine = index.as_query_engine(streaming=True, similarity_top_k=1)

# Perform the query; the result exposes the generated text through `response_gen`
streaming_response = query_engine.query("What did the author do growing up?")

# Drain the generator and assemble the complete answer in one pass
# (str.join avoids repeated string concatenation inside a loop)
full_response = "".join(streaming_response.response_gen)

# Now you can use the full_response as needed
print(full_response)

# Ensure that the full_response is passed to TruLens for evaluation
# NOTE(review): `trulens.evaluate` below is illustrative only -- it has not been
# confirmed as an existing TruLens API.
# Example: records_df = trulens.evaluate(full_response)

Additionally, ensure that the CondenseQuestionChatEngine is properly configured to handle streaming responses. The stream_chat method in the CondenseQuestionChatEngine class is designed to handle streaming responses and record them correctly:

@trace_method("chat")
def stream_chat(
    self, message: str, chat_history: Optional[List[ChatMessage]] = None
) -> StreamingAgentChatResponse:
    """Answer `message` via the query engine and return a streaming chat response.

    Args:
        message: The latest user message.
        chat_history: Prior conversation; when empty or omitted it is read
            from the engine's memory.

    Returns:
        A StreamingAgentChatResponse wrapping the query engine's token stream,
        with the tool output attached as its single source.

    Raises:
        ValueError: If the query engine did not return a StreamingResponse
            with a usable generator.
    """
    history = chat_history or self._memory.get(input=message)

    # Rewrite the conversation plus the last message into one standalone question.
    standalone_question = self._condense_question(history, message)

    log_str = f"Querying with: {standalone_question}"
    logger.info(log_str)
    if self._verbose:
        print(log_str)

    # Run the standalone question through the query engine.
    query_response = self._query_engine.query(standalone_question)
    tool_output = self._get_tool_output_from_response(
        standalone_question, query_response
    )

    # Guard: everything below requires a streaming response with a generator.
    is_streaming = (
        isinstance(query_response, StreamingResponse)
        and query_response.response_gen is not None
    )
    if not is_streaming:
        raise ValueError("Streaming is not enabled. Please use chat() instead.")

    # Store the user message first, then wrap the generator in the chat response.
    self._memory.put(ChatMessage(role=MessageRole.USER, content=message))
    response = StreamingAgentChatResponse(
        chat_stream=response_gen_from_query_engine(query_response.response_gen),
        sources=[tool_output],
    )

    # Run write_response_to_history on a separate thread, started before
    # the response is returned to the caller.
    history_writer = Thread(
        target=response.write_response_to_history,
        args=(self._memory, True),
    )
    history_writer.start()
    return response

By collecting the full response from the StreamingResponse object and ensuring that the CondenseQuestionChatEngine is properly configured to handle streaming responses, you can ensure that the output is not empty and can be properly evaluated by TruLens. This should resolve the issue with the output column being empty and the groundedness column missing. References: [1](https://github.com/run-llama/llama_index/blob/main/docs/docs/module_guides/deploying/query_engine/streaming.md) [2](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/chat_engine/condense_question.py) [3](https://github.com/run-llama/llama_index/blob/main/llama-index-legacy/llama_index/legacy/chat_engine/condense_question.py).

To continue talking to Dosu , mention @dosu.*

赞(0）回复(0）举报 4个月前

mqkwyuun4#

仍然无法工作，出现相同的问题。

赞(0）回复(0）举报 4个月前

我来回答

llama_index [问题]：LlamaIndex 流式响应（streaming）评估

问题验证

问题

4条答案

相关问题

热门标签

最新问答