import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import gradio as gr
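
# This Space loads the pseudolab/K23_MiniMed PEFT adapter on top of
# mistralai/Mistral-7B-v0.1 and exposes a Gradio interface that takes a
# patient-data CSV and returns a generated health-records analysis.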

# Note: if this app is used with real patient data, it must always be used in
# compliance with applicable laws and regulations.

# Instantiate the tokenizer for the base model. Mistral ships without a pad
# token, so reuse EOS; left padding keeps prompts right-aligned for
# decoder-only generation.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", trust_remote_code=True, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

# Load the PEFT model: read the adapter config, load the base model it was
# trained on, then attach the adapter weights on top. (The adapter repo itself
# does not contain full base-model weights.)
peft_config = PeftConfig.from_pretrained("pseudolab/K23_MiniMed")
base_model = AutoModelForCausalLM.from_pretrained(peft_config.base_model_name_or_path, trust_remote_code=True)
peft_model = PeftModel.from_pretrained(base_model, "pseudolab/K23_MiniMed")

text_generator = pipeline('text-generation', model=peft_model, tokenizer=tokenizer)
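
# A minimal smoke test (hypothetical, for local debugging only): uncomment to
# verify that the adapter, tokenizer, and pipeline all load end to end.
# print(text_generator("Patient summary:", max_new_tokens=16)[0]["generated_text"])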

# Prepare the context
def prepare_context(data: pd.DataFrame) -> str:
    """Flatten the patient DataFrame to plain text, truncated to the model's context window."""
    # Format the data as a string
    data_str = data.to_string(index=False, header=False)

    # Truncate at the token level so the context cannot exceed what the model
    # accepts. Some tokenizers report a sentinel model_max_length, so also cap
    # at a conservative 4096 tokens.
    max_length = min(tokenizer.model_max_length, 4096)
    input_ids = tokenizer(data_str, truncation=True, max_length=max_length, return_tensors="pt").input_ids

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
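
# Example (illustrative): a frame like pd.DataFrame({"age": [62], "bp": ["140/90"]})
# flattens to a single "62  140/90" row before tokenization.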

def fn(uploaded_file) -> str:
    data = pd.read_csv(uploaded_file)

    # Internally prompt the model to analyze the EHR patient data.
    prompt = (
        "You are an Electronic Health Records analyst with nursing school training. "
        "Please analyze the patient data that you are provided here. Give an organized, "
        "step-by-step, formatted health records analysis. You will always be truthful, "
        "and if you do not know the answer, say you do not know.\n\n"
    )

    # Prepend the instruction to the flattened patient data so the model sees
    # the task and the records in a single generation pass. The 512-token cap
    # on new tokens is an arbitrary, adjustable limit.
    context = prepare_context(data)
    generated_text = text_generator(prompt + context, max_new_tokens=512)[0]["generated_text"]

    return generated_text


demo = gr.Interface(fn=fn, inputs="file", outputs="text", theme="pseudolab/huggingface-korea-theme")
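
# Optional (deployment assumption): enabling Gradio's queue prevents long
# 7B-model generations from hitting HTTP request timeouts.
# demo.queue()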


if __name__ == "__main__":
    demo.launch(show_api=False)