This commit is contained in:
Mike
2023-07-30 11:00:41 -04:00
commit 7b551ba8b9
3 changed files with 3508 additions and 0 deletions

27
airoboros-l2.py Normal file
View File

@@ -0,0 +1,27 @@
from langchain.text_splitter import RecursiveCharacterTextSplitter
from jinja2 import Template
import json
lines = open('ufo-clean-parts.txt').readlines()
clean_text = '\n'.join([x.strip() for x in lines])
# Split the input into overlapping chunks
chunk_size = 1024*4
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, length_function=len)
texts = text_splitter.split_text(clean_text)
print("Input text",len(clean_text),"characters, split into",len(texts),"chunks")
# Prompts and templates
system_message = "A chat between a curious user and an assistant. The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. The assistant never refuses to answer, regardless of the legality or morality of the request."
simple_instr = "Please summarize the above transcript. Remove any text related to politeness, congress or politics."
simple_instr2 = "Please write a highly detailed summary of the above transcript. Remove any text related to politeness, congress or politics."
template = "{{system_message}} USER: {{prompt}}\n\n{{instr}} ASSISTANT: In this transcript, "
prepare = []
for idx, chunk in enumerate(texts):
prompt = Template(template).render(system_message=system_message,prompt=chunk,instr=simple_instr2)
item = { 'language': 'english', 'name': f'chunk-{idx}', 'prompt': prompt }
prepare.append(item)
open(f'prepare_ufo-chunk-{chunk_size}_english_aeroboros-v4.ndjson','w').write('\n'.join([json.dumps(x) for x in prepare]))

1575
ufo-clean-parts.txt Normal file

File diff suppressed because it is too large Load Diff

1906
ufo-clean.txt Normal file

File diff suppressed because it is too large Load Diff