From fe62ae4d5e9f5bb49d32323606a1312c381953c2 Mon Sep 17 00:00:00 2001
From: ageer <ageerle@163.com>
Date: 星期日, 16 三月 2025 19:39:55 +0800
Subject: [PATCH] python脚本

---
 script/docker/localModels/app.py           |  116 ++++++++++++++++++++++++++++++++++++++
 script/docker/localModels/requirements.txt |    3 +
 README.md                                  |    2 
 script/docker/localModels/Dockerfile       |   21 +++++++
 4 files changed, 141 insertions(+), 1 deletions(-)

diff --git a/README.md b/README.md
index bf66bf5..c520351 100644
--- a/README.md
+++ b/README.md
@@ -158,7 +158,7 @@
 - [Naive UI](https://www.naiveui.com)
 - [RuoYi-Vue-Plus](https://gitee.com/dromara/RuoYi-Vue-Plus)
 
-## Community
+## 贡献者
 <a href="https://github.com/ageerle/ruoyi-ai/graphs/contributors">
   <img src="https://contrib.rocks/image?repo=ageerle/ruoyi-ai" />
 </a>
diff --git a/script/docker/localModels/Dockerfile b/script/docker/localModels/Dockerfile
new file mode 100644
index 0000000..c988920
--- /dev/null
+++ b/script/docker/localModels/Dockerfile
@@ -0,0 +1,21 @@
+# 使用官方 Python 作为基础镜像
+FROM python:3.8-slim
+
+# 设置工作目录为 /app
+WORKDIR /app
+
+# 复制当前目录下的所有文件到 Docker 容器的 /app 目录
+COPY . /app
+
+# 安装应用依赖
+RUN pip install --no-cache-dir -r requirements.txt
+
+# 暴露 Flask 应用使用的端口
+EXPOSE 5000
+
+# 设置环境变量
+ENV FLASK_APP=app.py
+ENV FLASK_RUN_HOST=0.0.0.0
+
+# 启动 Flask 应用
+CMD ["flask", "run", "--host=0.0.0.0"]
diff --git a/script/docker/localModels/app.py b/script/docker/localModels/app.py
new file mode 100644
index 0000000..645a9b4
--- /dev/null
+++ b/script/docker/localModels/app.py
@@ -0,0 +1,116 @@
+from flask import Flask, request, jsonify
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import json
+
+app = Flask(__name__)
+
+# 创建一个全局的模型缓存字典
+model_cache = {}
+
+# 分割文本块
+def split_text(text, block_size, overlap_chars, delimiter):
+    chunks = text.split(delimiter)
+    text_blocks = []
+    current_block = ""
+
+    for chunk in chunks:
+        if len(current_block) + len(chunk) + 1 <= block_size:
+            if current_block:
+                current_block += " " + chunk
+            else:
+                current_block = chunk
+        else:
+            text_blocks.append(current_block)
+            current_block = chunk
+    if current_block:
+        text_blocks.append(current_block)
+
+    overlap_blocks = []
+    for i in range(len(text_blocks)):
+        if i > 0:
+            overlap_block = text_blocks[i - 1][-overlap_chars:] + text_blocks[i]
+            overlap_blocks.append(overlap_block)
+        overlap_blocks.append(text_blocks[i])
+
+    return overlap_blocks
+
+# 文本向量化
+def vectorize_text_blocks(text_blocks, model):
+    return model.encode(text_blocks)
+
+# 文本检索
+def retrieve_top_k(query, knowledge_base, k, block_size, overlap_chars, delimiter, model):
+    # 将知识库拆分为文本块
+    text_blocks = split_text(knowledge_base, block_size, overlap_chars, delimiter)
+    # 向量化文本块
+    knowledge_vectors = vectorize_text_blocks(text_blocks, model)
+    # 向量化查询文本
+    query_vector = model.encode([query]).reshape(1, -1)
+    # 计算相似度
+    similarities = cosine_similarity(query_vector, knowledge_vectors)
+    # 获取相似度最高的 k 个文本块的索引
+    top_k_indices = similarities[0].argsort()[-k:][::-1]
+
+    # 返回文本块和它们的向量
+    top_k_texts = [text_blocks[i] for i in top_k_indices]
+    top_k_embeddings = [knowledge_vectors[i] for i in top_k_indices]
+
+    return top_k_texts, top_k_embeddings
+
+@app.route('/vectorize', methods=['POST'])
+def vectorize_text():
+    # 从请求中获取 JSON 数据
+    data = request.json
+    print(f"Received request data: {data}") # 调试输出请求数据
+
+    text_list = data.get("text", [])
+    model_name = data.get("model_name", "msmarco-distilbert-base-tas-b") # 默认模型
+
+    delimiter = data.get("delimiter", "\n") # 默认分隔符
+    k = int(data.get("k", 3)) # 默认检索条数
+    block_size = int(data.get("block_size", 500)) # 默认文本块大小
+    overlap_chars = int(data.get("overlap_chars", 50)) # 默认重叠字符数
+
+    if not text_list:
+        return jsonify({"error": "Text is required."}), 400
+
+    # 检查模型是否已经加载
+    if model_name not in model_cache:
+        try:
+            model = SentenceTransformer(model_name)
+            model_cache[model_name] = model # 缓存模型
+        except Exception as e:
+            return jsonify({"error": f"Failed to load model: {e}"}), 500
+
+    model = model_cache[model_name]
+
+    top_k_texts_all = []
+    top_k_embeddings_all = []
+
+    # 如果只有一个查询文本
+    if len(text_list) == 1:
+        top_k_texts, top_k_embeddings = retrieve_top_k(text_list[0], text_list[0], k, block_size, overlap_chars, delimiter, model)
+        top_k_texts_all.append(top_k_texts)
+        top_k_embeddings_all.append(top_k_embeddings)
+    elif len(text_list) > 1:
+        # 如果多个查询文本，依次处理
+        for query in text_list:
+            top_k_texts, top_k_embeddings = retrieve_top_k(query, text_list[0], k, block_size, overlap_chars, delimiter, model)
+            top_k_texts_all.append(top_k_texts)
+            top_k_embeddings_all.append(top_k_embeddings)
+
+    # 将嵌入向量（ndarray）转换为可序列化的列表
+    top_k_embeddings_all = [[embedding.tolist() for embedding in embeddings] for embeddings in top_k_embeddings_all]
+
+    print(f"Top K texts: {top_k_texts_all}") # 打印检索到的文本
+    print(f"Top K embeddings: {top_k_embeddings_all}") # 打印检索到的向量
+
+    # 返回 JSON 格式的数据
+    return jsonify({
+
+        "topKEmbeddings": top_k_embeddings_all # 返回嵌入向量
+    })
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=5000, debug=True)
diff --git a/script/docker/localModels/requirements.txt b/script/docker/localModels/requirements.txt
new file mode 100644
index 0000000..c1e1b50
--- /dev/null
+++ b/script/docker/localModels/requirements.txt
@@ -0,0 +1,3 @@
+Flask==2.0.3
+sentence-transformers==2.2.0
+scikit-learn==0.24.2
--
Gitblit v1.9.3