# main.py - ask a local Ollama model for vehicle details found in a saved HTML listing.

from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import BSHTMLLoader
# from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
  11. MODEL = "mistral"
  12. prompt = ChatPromptTemplate.from_template(
  13. """
  14. You are a world class expert in vehicle appraisal.
  15. Answer only based on the following provided context. If you know the answer but it's not based in the
  16. provided context, don't provide the answer, just state the answer is not in the context provided:
  17. <context>
  18. {context}
  19. </context>
  20. User: {input}
  21. """
  22. )
  23. llm = Ollama(model=MODEL)
  24. raw_documents = BSHTMLLoader("../html/yapo_3.html").load()
  25. text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
  26. texts = text_splitter.split_documents(raw_documents)
  27. # loader = UnstructuredURLLoader(urls=[
  28. # "https://www.yapo.cl/vehiculos/peugeot-3008-16-hybrid4-e-auto8-gt-2023_89068823"
  29. # ])
  30. vector = Chroma.from_documents(texts, OllamaEmbeddings())
  31. retriever = vector.as_retriever()
  32. output_parser = StrOutputParser()
  33. document_chain = create_stuff_documents_chain(llm, prompt, output_parser=output_parser)
  34. retrieval_chain = create_retrieval_chain(retriever, document_chain)
  35. response = retrieval_chain.invoke({
  36. "input": """
  37. Estás viendo la publicación de un vehículo, búsca la siguiente información:
  38. año
  39. marca
  40. modelo
  41. precio
  42. kilometraje
  43. combustible
  44. transmisión
  45. main gallery photo url
  46. código publicación
  47. """
  48. })
  49. print(response["answer"])