tldw/airoboros-l2.py

from langchain.text_splitter import RecursiveCharacterTextSplitter
from jinja2 import Template
import json

lines = open('ufo-clean-parts.txt').readlines()
clean_text = '\n'.join([x.strip() for x in lines])

# Split the input into overlapping chunks
chunk_size = 1024*4
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, length_function=len)
texts = text_splitter.split_text(clean_text)
print("Input text",len(clean_text),"characters, split into",len(texts),"chunks")

# Prompts and templates
system_message = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."

simple_instr_v3 = "Please summarize the above transcript. Remove any text related to politeness, congress or politics."
simple_instr_v4 = "Please write a highly detailed summary of the above transcript. Remove any text related to politeness, congress or politics."

template = "{{system_message}} USER: {{prompt}}\n\n{{instr}} ASSISTANT: In this transcript, "

prepare = []
for idx, chunk in enumerate(texts):
    prompt = Template(template).render(system_message=system_message,prompt=chunk,instr=simple_instr_v4)
    item = { 'language': 'english', 'name': f'chunk-{idx}', 'prompt': prompt }
    prepare.append(item)

open(f'prepare_ufo-chunk-{chunk_size}_english_airoboros-l2-v4.ndjson','w').write('\n'.join([json.dumps(x) for x in prepare]))