feat(scripts): add json and parquet support to prompt book parser
This commit is contained in:
parent
818d222ff8
commit
2121c7aa5d
|
@ -1,8 +1,8 @@
|
||||||
from typing import List
|
|
||||||
from argparse import ArgumentParser
|
from argparse import ArgumentParser
|
||||||
from sys import argv
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from json import dumps
|
from json import dumps, loads
|
||||||
|
from sys import argv
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
def parse_args(args: List[str]):
|
def parse_args(args: List[str]):
|
||||||
|
@ -14,13 +14,46 @@ def parse_args(args: List[str]):
|
||||||
return parser.parse_args(args)
|
return parser.parse_args(args)
|
||||||
|
|
||||||
|
|
||||||
|
def load_duck(file: str):
    """Load prompts from a file that DuckDB can query directly.

    Runs ``SELECT *`` against ``file`` on a fresh in-memory DuckDB
    connection and keeps only the first column of every row.

    Args:
        file: Path of the file to query. The caller in this script passes
            paths ending in ``.parquet`` or ``.duckdb``.

    Returns:
        A list holding the first-column value of each row, in result order.
    """
    # Imported lazily so the optional dependency is only needed when a
    # DuckDB-readable file is actually requested.
    import duckdb

    # The path is embedded in a single-quoted SQL literal; double any embedded
    # single quote so a path such as "bob's.parquet" cannot terminate the
    # literal early and change the statement.
    quoted = file.replace("'", "''")

    connection = duckdb.connect()
    try:
        rows = connection.sql(f"SELECT * FROM '{quoted}'").fetchall()
    finally:
        # Release the connection even when the query fails.
        connection.close()

    return [row[0] for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def load_json(file: str):
    """Extract the single prompt stored in a JSON file.

    The document is expected to be an object with a ``"params"`` object.
    The prompt is taken from ``params["input_prompt"]`` when that value is
    truthy, otherwise from ``params["prompt"]``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The prompt value, or ``""`` when the document has no usable
        ``params`` object or neither key holds a truthy value.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, "r", encoding="utf-8") as f:
        data = loads(f.read())

    # Guard against documents whose top level (or "params") is not an object,
    # which would otherwise fail with AttributeError on ``.get``.
    params = data.get("params") if isinstance(data, dict) else None
    if not isinstance(params, dict):
        return ""

    # Keys are tried in priority order; the first truthy value wins.
    for key in ("input_prompt", "prompt"):
        prompt = params.get(key)
        if prompt:
            return prompt

    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def load_text(file: str):
    """Read a text file and return its lines.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one string per line of the file, each keeping its
        trailing newline when the file has one.
    """
    with open(file, "r") as source:
        # Iterating the handle yields the same lines as ``readlines()``.
        lines = list(source)
    return lines
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
args = parse_args(argv[1:])
|
args = parse_args(argv[1:])
|
||||||
|
|
||||||
lines: List[str] = []
|
lines: List[str] = []
|
||||||
for file in args.file:
|
for file in args.file:
|
||||||
with open(file, "r") as f:
|
if file.endswith(".parquet") or file.endswith(".duckdb"):
|
||||||
lines.extend(f.readlines())
|
lines.extend(load_duck(file))
|
||||||
|
elif file.endswith(".json"):
|
||||||
|
# json only contains a single prompt
|
||||||
|
lines.append(load_json(file))
|
||||||
|
else:
|
||||||
|
lines.extend(load_text(file))
|
||||||
|
|
||||||
phrases = []
|
phrases = []
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
|
Loading…
Reference in New Issue