1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
| from pymilvus import ( connections, FieldSchema, CollectionSchema, DataType, Collection, utility )
# Open the default connection to the Milvus server on this machine.
connections.connect(host="localhost", port="19530")

# Collection layout: an auto-generated INT64 primary key, a 768-dimensional
# float vector, the document text, and a short label naming its source.
fields = [
    FieldSchema("id", DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=768),
    FieldSchema("text", DataType.VARCHAR, max_length=65535),
    FieldSchema("source", DataType.VARCHAR, max_length=256),
]

schema = CollectionSchema(fields, description="AI知识库")
collection = Collection("ai_knowledge", schema)

# IVF_FLAT index over the vector field, compared with the cosine metric.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_FLAT",
    params=dict(nlist=1024),
)
collection.create_index("embedding", index_params)
def insert_data(texts, sources, model_name='all-mpnet-base-v2'):
    """Embed documents and insert them into the module-level ``collection``.

    Args:
        texts: Document strings; each is encoded and stored in ``text``.
        sources: Source labels stored in ``source``, one per entry of
            ``texts``.
        model_name: Name of the SentenceTransformer model used to encode
            ``texts``.

    Returns:
        None. A summary line with the number of rows is printed.

    An empty ``texts`` is reported as zero rows without loading the model
    or contacting Milvus.

    NOTE(review): ``SentenceTransformer`` is not imported in the visible
    part of this file; it must be in scope at module level.
    """
    # Materialise the inputs so len() works and Milvus receives plain lists.
    texts = list(texts)
    sources = list(sources)

    if not texts:
        # Nothing to encode or insert; skip the model load and the round-trip.
        print(f"已插入 {len(texts)} 条数据")
        return

    encoder = SentenceTransformer(model_name)
    embeddings = encoder.encode(texts).tolist()

    # Column-ordered data, one list per field in schema order. The primary
    # key is omitted because the schema declares it with auto_id=True.
    entities = [embeddings, texts, sources]
    collection.insert(entities)
    collection.flush()
    print(f"已插入 {len(texts)} 条数据")
def milvus_search(query_text, top_k=10, *, model_name='all-mpnet-base-v2', nprobe=16):
    """Search the module-level ``collection`` for text similar to a query.

    Args:
        query_text: The query string to encode and search with.
        top_k: Maximum number of hits to retrieve.
        model_name: Name of the SentenceTransformer model used to encode the
            query. It should match the model used when the data was inserted,
            otherwise query and stored vectors are not comparable.
        nprobe: Value passed as the ``nprobe`` search parameter.

    Returns:
        None. Each hit is printed with its similarity, source, and the first
        100 characters of its text.

    NOTE(review): ``SentenceTransformer`` is not imported in the visible
    part of this file; it must be in scope at module level.
    """
    encoder = SentenceTransformer(model_name)
    # encode() receives a one-element list so the result is a batch of one
    # vector, the shape that search() takes as ``data``.
    query_vector = encoder.encode([query_text]).tolist()

    collection.load()
    search_params = {"metric_type": "COSINE", "params": {"nprobe": nprobe}}
    results = collection.search(
        data=query_vector,
        anns_field="embedding",
        param=search_params,
        limit=top_k,
        output_fields=["text", "source"],
    )

    for hits in results:
        for hit in hits:
            print(f"相似度: {hit.distance:.4f}")
            print(f"来源: {hit.entity.get('source')}")
            print(f"文本: {hit.entity.get('text')[:100]}...")
            print("---")
# Switch the "embedding" field to an HNSW index with the cosine metric.
hnsw_params = {
    "metric_type": "COSINE",
    "index_type": "HNSW",
    "params": {"M": 16, "efConstruction": 256},
}

# An index is already created on this field earlier in the file, and Milvus
# rejects a second, different index on the same field. Release the collection
# (an index cannot be dropped while it is loaded) and drop the existing index
# before building the new one.
collection.release()
if collection.has_index():
    collection.drop_index()
collection.create_index(field_name="embedding", index_params=hnsw_params)
|