File size: 8,112 Bytes
28f4a08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f564c9e
 
a262acc
f564c9e
 
 
 
28f4a08
 
 
a262acc
 
 
 
28f4a08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a262acc
 
28f4a08
a262acc
 
 
28f4a08
a262acc
 
28f4a08
 
 
f564c9e
28f4a08
 
 
 
 
f564c9e
 
 
 
 
 
28f4a08
f564c9e
 
 
 
 
 
28f4a08
 
 
 
 
 
f564c9e
 
28f4a08
 
 
 
 
 
 
f564c9e
 
 
 
 
 
 
 
 
 
 
28f4a08
 
f564c9e
 
 
 
 
 
 
 
28f4a08
f564c9e
 
 
28f4a08
 
 
 
37dbff3
40eb027
 
28f4a08
f564c9e
e1980f6
f564c9e
 
 
 
 
 
 
 
 
 
40eb027
7089c59
 
f564c9e
 
 
 
28f4a08
f564c9e
28f4a08
f564c9e
 
 
28f4a08
 
 
 
 
7089c59
28f4a08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f564c9e
40eb027
f564c9e
28f4a08
f564c9e
28f4a08
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import base64

import spacy
import streamlit as st
from annotated_text import annotated_text
from streamlit_echarts import st_echarts

# Global page configuration; must run before any other Streamlit command.
# NOTE(review): the menu_items URLs and About text are placeholder values
# from the Streamlit docs example — replace with real project links.
st.set_page_config(
     page_title="LeetSpeak-NER",
     page_icon=":mega:",
     layout="wide",
     initial_sidebar_state="expanded",
     menu_items={
         'Get Help': 'https://www.extremelycoolapp.com/help',
         'Report a bug': "https://www.extremelycoolapp.com/bug",
         'About': "# This is a header. This is an *extremely* cool app!"
     }
 )


@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
def load_models(selected_for="Accuracy"):
    """Load and cache the per-language spaCy NER pipelines.

    Args:
        selected_for: "Accuracy" loads the full RoBERTa-based models,
            "Efficiency" loads the lightweight toy models. This was
            previously read from a module-level global that is never
            defined (the sidebar radio that set it is commented out),
            so calling the function raised NameError.

    Returns:
        dict mapping language name ("English" / "Spanish") to a loaded
        spaCy pipeline.

    Raises:
        ValueError: if ``selected_for`` is not a supported option
            (previously this fell through to an UnboundLocalError).
    """
    if selected_for == "Accuracy":
        spanish_model = spacy.load("./spacy-models/output_full_ES_roberta-base-bne/model-best")
        english_model = spacy.load("./spacy-models/output_full_EN_roberta_base/model-best")
    elif selected_for == "Efficiency":
        spanish_model = spacy.load("./spacy-models/toy_output_es_blank/model-best")
        english_model = spacy.load("./spacy-models/toy_output_en_blank/model-best/")
    else:
        raise ValueError(f"Unknown option {selected_for!r}; expected 'Accuracy' or 'Efficiency'")
    return {"English": english_model, "Spanish": spanish_model}

@st.cache(show_spinner=True, allow_output_mutation=True, suppress_st_warning=True)
def load_xx_model():
    """Load and cache the multilingual LeetSpeak NER spaCy pipeline."""
    pipeline = spacy.load("xx_LeetSpeakNER_mstsb_mpnet")
    return pipeline
    

def process_text(doc, selected_multi_ner):
    """Convert a spaCy doc into tokens for ``annotated_text``.

    Args:
        doc: iterable of spaCy tokens (each with ``.text`` and ``.ent_type_``).
        selected_multi_ner: "Yes" keeps the four fine-grained camouflage
            labels, each with its own colour; any other value collapses them
            into a single "CAMOUFLAGE" label.

    Returns:
        list where entity tokens are ``(text, label, colour)`` tuples and
        non-entity tokens are plain strings padded with spaces.
    """
    # Colour per fine-grained camouflage label; membership in this mapping
    # also decides whether a token counts as a camouflage entity at all.
    label_colors = {
        "INV_CAMO": "#faa",
        "LEETSPEAK": "#fda",
        "MIX": "#afa",
        "PUNCT_CAMO": "#aaaaff",
    }
    rendered = []
    for token in doc:
        label = token.ent_type_
        if label in label_colors:
            if selected_multi_ner == "Yes":
                rendered.append((token.text, label, label_colors[label]))
            else:
                rendered.append((token.text, "CAMOUFLAGE", "#ffd5aa"))
        else:
            rendered.append(" " + token.text + " ")
    return rendered


# --- Sidebar controls ----------------------------------------------------
# Only the multilingual model is currently exposed; the per-language and
# accuracy/efficiency selectors below are kept commented out for reference.
# selected_language = st.sidebar.selectbox("Select a language", options=["English", "Spanish"])
selected_language = st.sidebar.selectbox("Select a language", options=["Multilingual"])
# "Yes" shows the four fine-grained entity labels; "No" collapses them into
# a single CAMOUFLAGE label (consumed by process_text).
selected_multi_ner = st.sidebar.radio('Do you want to break down the Entities detected by type of leetspeak?', ['Yes', 'No'])
# selected_for = st.sidebar.radio('Select for:', ['Efficiency', 'Accuracy'])
# models = load_models()
# selected_model = models[selected_language]


# Cached multilingual spaCy pipeline used for all inference below.
selected_model = load_xx_model()

# base64 is imported at the top of the file with the other imports
# (it was previously imported here, mid-module, against PEP 8).
# Logo embedded as a base64 data URI so the CSS classes below can size it.
LOGO_IMAGE = "LeetSpeak-NER-cropped.png"

# Inject CSS classes used to size/position the two header logos rendered
# below as raw <img> tags.
st.markdown(
    """
    <style>
    .logo-img {


        margin-top: auto;
        margin-left: 30%;
        width: 30%;
        
    }
    .logo-img-2 {
    margin-top: 10%;
      margin-left: 20%;
      width: 35%;        
        
    }   
    </style>
    """,
    unsafe_allow_html=True
)


def _b64_image(path):
    """Return the file at *path* base64-encoded for use in a data URI.

    Uses a context manager so the handle is closed — the previous inline
    ``open(...).read()`` calls leaked file handles.
    """
    with open(path, "rb") as fh:
        return base64.b64encode(fh.read()).decode()


# Two-column header: project logo on the left, AI+DA group logo on the
# right. Raw <img> tags (instead of st.image) so the CSS classes injected
# earlier in the file can control size and position.
col1, col2 = st.columns([2, 2])
with col1:
    st.markdown(
        f"""
            <img class="logo-img" src="data:image/png;base64,{_b64_image(LOGO_IMAGE)}">
        """,
        unsafe_allow_html=True
    )

with col2:
    st.markdown(
        f"""
            <img class="logo-img-2" src="data:image/png;base64,{_b64_image("aida_logo.png")}">
        """,
        unsafe_allow_html=True
    )

# st.image([LOGO_IMAGE,"aida_logo.png"], width=100)


st.markdown("""
    <style>
    .big-font {
        font-size:3em;
        font-weight: bold;
    }
    </style>
    """, unsafe_allow_html=True)

st.markdown('<p class="big-font">Welcome to <font color="#4B8BBE">Leet</font><font color=" #FFD43B">Speak</font><font color="#ff73a2">-NER</font></p>', unsafe_allow_html=True)
        
        
with st.expander("Project Description", expanded=False):
     st.write("""
         Developed in Applied Intelligence and Data Analysis ([AI+DA](http://aida.etsisi.upm.es/)) group at Polytech University of Madrid (UPM).
         
         This tool uses a Spacy-Transformer Name Entity Recognition model to detect the presence of words camouflaged. Word camouflage is currently used to evade content moderation in Social Media. Therefore, the aim of this tool is to counter new ways of misinformation that emerge in social media platforms. This version is a initial step focused on detecting word camouflage in news, reports or talks since the models have been fine-tuned over TED talks, OPUS News Commentaries and Wikipedia.
         
         Currently, two languages are supported: English and Spanish. Additionally, you can select whether the detected entities are broken down into the three types of camouflaged words: Canonical Leetspeak, Punctuation Camouflaged, Inversion Camouflaged. Finally, you can select between "accuracy" or "efficiency" performance. 
     """)  
        
with st.expander("Try any of these examples", expanded=False):
     st.write("""
    ENGLISH:
    - Desperately dominated by fam1ly sitüatløns, he leaves her.
    - You might as well come out to investigate a strang3 n'o?i+se or something.
    - But one other thing that we have to re;think is the way that we dy£ our #c!l.o|th?£+s.
    - And he wanted Baltimore to get that same kind of att£ntløn from the outside, but )i)n)t)r)o)s)p)e)c)t)i)o)n from the inside about what was going on with us.

     
    SPANISH
    - _d+i%o"s mío!
    - La C0v!d es un 3ng@ño de los G0b!3rno$
    - @#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)
    - se asocian con el m13;d0 y el d'o'lor. g£rønlmo solía decir
    - Con las nuevas tecnologías digitales, los agrlcultør£s pueden manejar mejor el uso de sus tierras, su energía y su agua, y prepararse para el mal clima.
    - En el tiempo transcurrido entre mi período de escuela %s%3%c%_%n%d%a%r%1%a y el mo'm3n'to de empezar a enseñar vimos surgir el fenómeno de in't£r'net
    - Las pre0c_pac1on3s van desde inquietudes por las ramificaciones desestabilizadoras de una estrategia de salida de la FC, hasta aprehensión por pérdidas de capital en la rápidamente creciente cartera de valores de la Fed (actualmente de $3 billones y en camino a los $4 billones para finales de este año).
     """)  
    
    #     - Why do all these _r_e_p_o_r_t_e_r_s, who get praise and money for doing what Assange has done, maintain a cow;ardly silence (at best) while a fellow publisher faces threats of extradition, banning, and espionage charges (which can incur the death penalty), not to mention calls for his as'sa'ss1nat'i'on?
    
    # - Cada uno de estos es un crimen de guerra, un crimen contra la humanidad y, en el caso de los asesinatos masivos de la campaña de Anfal, y tal vez también en el caso de los árabes de los pantanos, el crimen más serio de todos, ge'no'ci'dio.    
  
    # - No quiere decir que debamos iniciar una campaña por los derechos de los lns£ctøs
    

st.subheader("Input Text")   

with st.form("my_form"):
    text_input = st.text_area('Insert a text to detect leetspeak entities', 
                              # placeholder="@#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)", 
                              # value="@#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)"
                             )

    uploaded_file = st.file_uploader("or Upload a file", type=["doc", "docx", "pdf", "txt"])
    if uploaded_file is not None:
        text_input = uploaded_file.getvalue()
        text_input = text_input.decode("utf-8")
    
    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
       
    

    
st.subheader("Output")
with st.spinner('Wait for it...'):
    doc = selected_model(text_input.lower())
    tokens = process_text(doc, selected_multi_ner)

    annotated_text(*tokens)