1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
| from langchain.document_loaders import ( PyPDFLoader, UnstructuredHTMLLoader, CSVLoader, JSONLoader, Docx2txtLoader ) from langchain.text_splitter import ( RecursiveCharacterTextSplitter, MarkdownTextSplitter, PythonCodeTextSplitter ) from langchain.schema import Document
class DocumentPipeline:
    """Document processing pipeline: load documents and split them into chunks.

    The pipeline keeps one pre-configured text splitter per strategy in
    ``self.text_splitters`` (keys: ``"recursive"``, ``"markdown"``,
    ``"code"``), all built with the same ``chunk_size`` / ``chunk_overlap``.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50) -> None:
        """Build the splitters shared by every call on this pipeline.

        Args:
            chunk_size: Value forwarded as ``chunk_size`` to every splitter.
            chunk_overlap: Value forwarded as ``chunk_overlap`` to every
                splitter.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitters = {
            "recursive": RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
                # Paragraph break, newline, Chinese full stop, "!", "?",
                # space, and finally the empty string.
                # NOTE(review): "。" is full-width but "!" / "?" are ASCII;
                # confirm whether the full-width "!" / "?" were intended.
                separators=["\n\n", "\n", "。", "!", "?", " ", ""],
            ),
            "markdown": MarkdownTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
            "code": PythonCodeTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            ),
        }

    def load_pdf(self, file_path: str) -> list[Document]:
        """Load a PDF file.

        NOTE(review): this is the only loader here that calls
        ``load_and_split()`` rather than ``load()``; presumably the result is
        already passed through the loader's default splitter, independent of
        this pipeline's ``chunk_size`` — confirm that is intended.
        """
        loader = PyPDFLoader(file_path)
        return loader.load_and_split()

    def load_html(self, file_path: str) -> list[Document]:
        """Load an HTML file."""
        loader = UnstructuredHTMLLoader(file_path)
        return loader.load()

    def load_docx(self, file_path: str) -> list[Document]:
        """Load a Word (.docx) document."""
        loader = Docx2txtLoader(file_path)
        return loader.load()

    def load_json(self, file_path: str, jq_schema: str = ".[]") -> list[Document]:
        """Load a JSON file.

        Args:
            file_path: Path of the JSON file.
            jq_schema: Forwarded to ``JSONLoader`` as ``jq_schema``; the
                default is ``".[]"``.
        """
        loader = JSONLoader(file_path, jq_schema=jq_schema)
        return loader.load()

    def split_documents(
        self,
        documents: list[Document],
        splitter_type: str = "recursive",
    ) -> list[Document]:
        """Split documents with the splitter registered under ``splitter_type``.

        Args:
            documents: Documents to split.
            splitter_type: Key into ``self.text_splitters``. An unknown key
                falls back to ``"recursive"`` and emits a ``UserWarning`` so
                that a misspelled key does not go unnoticed.

        Returns:
            Whatever the selected splitter's ``split_documents`` returns.
        """
        splitter = self.text_splitters.get(splitter_type)
        if splitter is None:
            # Local import keeps this block self-contained.
            import warnings

            warnings.warn(
                f"Unknown splitter_type {splitter_type!r}; falling back to "
                f"'recursive'. Available: {sorted(self.text_splitters)}",
                stacklevel=2,
            )
            splitter = self.text_splitters["recursive"]
        return splitter.split_documents(documents)

    def process_url(self, url: str, splitter_type: str = "recursive") -> list[Document]:
        """Load the content at ``url`` and split it.

        Args:
            url: Address handed to ``WebBaseLoader``.
            splitter_type: Key into ``self.text_splitters``; forwarded to
                ``split_documents``. Defaults to ``"recursive"``.
        """
        # Imported here rather than at module level, as in the original code.
        from langchain.document_loaders import WebBaseLoader

        loader = WebBaseLoader(url)
        docs = loader.load()
        return self.split_documents(docs, splitter_type)
|