mirror of
https://github.com/jlengrand/tldw.git
synced 2026-03-10 08:51:17 +00:00
28 lines
1.5 KiB
Python
28 lines
1.5 KiB
Python
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from jinja2 import Template
|
|
import json
|
|
|
|
lines = open('ufo-clean-parts.txt').readlines()
|
|
clean_text = '\n'.join([x.strip() for x in lines])
|
|
|
|
# Split the input into overlapping chunks
|
|
chunk_size = 1024*4
|
|
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, length_function=len)
|
|
texts = text_splitter.split_text(clean_text)
|
|
print("Input text",len(clean_text),"characters, split into",len(texts),"chunks")
|
|
|
|
# Prompts and templates
|
|
system_message = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
|
|
|
|
simple_instr_v3 = "Please summarize the above transcript. Remove any text related to politeness, congress or politics."
|
|
simple_instr_v4 = "Please write a highly detailed summary of the above transcript. Remove any text related to politeness, congress or politics."
|
|
|
|
template = "{{system_message}} USER: {{prompt}}\n\n{{instr}} ASSISTANT: In this transcript, "
|
|
|
|
prepare = []
|
|
for idx, chunk in enumerate(texts):
|
|
prompt = Template(template).render(system_message=system_message,prompt=chunk,instr=simple_instr_v4)
|
|
item = { 'language': 'english', 'name': f'chunk-{idx}', 'prompt': prompt }
|
|
prepare.append(item)
|
|
|
|
open(f'prepare_ufo-chunk-{chunk_size}_english_airoboros-l2-v4.ndjson','w').write('\n'.join([json.dumps(x) for x in prepare])) |