import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import gradio as gr
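
# This Space loads the pseudolab/K23_MiniMed PEFT adapter on top of
# mistralai/Mistral-7B-v0.1 and exposes a Gradio interface that takes a
# patient-data CSV and returns a generated health-records analysis.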

# Note: if this app is used with real patient data, it must always be used in
# compliance with applicable laws and regulations.

# Instantiate the tokenizer for the base model. Mistral ships without a pad
# token, so reuse EOS; left padding keeps prompts right-aligned for
# decoder-only generation.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Load the PEFT model: read the adapter config, load the base model it was
# trained on, then attach the adapter weights on top. (The adapter repo itself
# does not contain full base-model weights.)
peft_config = PeftConfig.from_pretrained("pseudolab/K23_MiniMed")
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)
peft_model = PeftModel.from_pretrained(base_model, "pseudolab/K23_MiniMed")

text_generator = pipeline('text-generation', model=peft_model, tokenizer=tokenizer)
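
# A minimal smoke test (hypothetical, for local debugging only): uncomment to
# verify that the adapter, tokenizer, and pipeline all load end to end.
# print(text_generator("Patient summary:", max_new_tokens=16)[0]["generated_text"])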

# Prepare the context
def prepare_context(data: pd.DataFrame) -> str:
    """Flatten the patient DataFrame to plain text, truncated to the model's context window."""
    # Format the data as a string
    data_str = data.to_string(index=False, header=False)

    # Truncate at the token level so the context cannot exceed what the model
    # accepts. Some tokenizers report a sentinel model_max_length, so also cap
    # at a conservative 4096 tokens.
    max_length = min(tokenizer.model_max_length, 4096)
    input_ids = tokenizer(data_str, truncation=True, max_length=max_length, return_tensors="pt").input_ids

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
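
# Example (illustrative): a frame like pd.DataFrame({"age": [62], "bp": ["140/90"]})
# flattens to a single "62  140/90" row before tokenization.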

def fn(uploaded_file) -> str:
    data = pd.read_csv(uploaded_file)

    # Internally prompt the model to analyze the EHR patient data.
    prompt = (
        "You are an Electronic Health Records analyst with nursing school training. "
        "Please analyze the patient data that you are provided here. Give an organized, "
        "step-by-step, formatted health records analysis. You will always be truthful, "
        "and if you do not know the answer, say you do not know.\n\n"
    )

    # Prepend the instruction to the flattened patient data so the model sees
    # the task and the records in a single generation pass. The 512-token cap
    # on new tokens is an arbitrary, adjustable limit.
    context = prepare_context(data)
    generated_text = text_generator(prompt + context, max_new_tokens=512)[0]["generated_text"]

    return generated_text


demo = gr.Interface(fn=fn, inputs="file", outputs="text", theme="pseudolab/huggingface-korea-theme")
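
# Optional (deployment assumption): enabling Gradio's queue prevents long
# 7B-model generations from hitting HTTP request timeouts.
# demo.queue()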


if __name__ == "__main__":
    demo.launch(show_api=False)