Spaces:

TURKCELL
/

offensive-lang-detection-tr

Sleeping

App Files Files Community

offensive-lang-detection-tr / app.py

zeynepgulhan

app file created

79bbdf9 verified 8 months ago

raw

history blame contribute delete

No virus

3.1 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch
	import numpy as np
	import re

	from turkish.deasciifier import Deasciifier

	# Model and tokenizer initialization.
	# Pick the GPU when PyTorch reports one, otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	# Both objects are loaded from the same checkpoint identifier.
	tokenizer = AutoTokenizer.from_pretrained(
	    "TURKCELL/bert-offensive-lang-detection-tr"
	)
	model = AutoModelForSequenceClassification.from_pretrained(
	    "TURKCELL/bert-offensive-lang-detection-tr"
	)
	# Move the model weights to the selected device; inputs are moved per request.
	model.to(device)


	def deasciifier(text):
	    """Return *text* after running it through ``Deasciifier.convert_to_turkish``."""
	    # A fresh converter is built for every call because it is constructed
	    # from the text it converts.
	    return Deasciifier(text).convert_to_turkish()


	def remove_circumflex(text):
	    """Return *text* with circumflexed vowels replaced by their plain letters.

	    Handles â, î, û, ô and their uppercase forms; every other character is
	    passed through unchanged.
	    """
	    # NOTE(review): 'Î' becomes ASCII 'I', which turkish_lower() then turns
	    # into dotless 'ı' - confirm that this is the intended result.
	    plain_letters = str.maketrans('âîûôÂÎÛÔ', 'aiuoAIUO')
	    return text.translate(plain_letters)


	def turkish_lower(text):
	    """Lowercase *text*, using Turkish casing for I, İ, Ç, Ş, Ğ, Ü and Ö.

	    'I' becomes dotless 'ı' and 'İ' becomes 'i'; all remaining characters
	    are lowered with their own ``str.lower()``.
	    """
	    turkish_table = str.maketrans('IİÇŞĞÜÖ', 'ıiçşğüö')
	    swapped = text.translate(turkish_table)
	    # Lower one character at a time (not the whole string at once) so each
	    # character is converted independently of its neighbours.
	    return ''.join(map(str.lower, swapped))


	def clean_text(text):
	    """Normalize a raw sentence before it is tokenized for the classifier.

	    Steps, in order: strip circumflexes, lowercase with Turkish rules, run
	    the deasciifier, then replace @mentions, #hashtags, URLs, punctuation
	    and emoji with spaces, and finally collapse runs of whitespace.

	    Args:
	        text: The sentence to clean.

	    Returns:
	        The cleaned sentence, single-spaced and without leading or
	        trailing whitespace.
	    """
	    # Remove circumflexed letters
	    text = remove_circumflex(text)
	    # Convert the text to lowercase
	    text = turkish_lower(text)
	    # deasciifier
	    text = deasciifier(text)
	    # Remove user names
	    text = re.sub(r"@\S*", " ", text)
	    # Remove hashtags
	    text = re.sub(r'#\S+', ' ', text)
	    # Remove URLs. The alternation must be a bare `|`: the previous `\|`
	    # matched a literal pipe character, so URLs were never removed.
	    # (`http\S+` already covers https links.)
	    text = re.sub(r"http\S+|www\S+", ' ', text)
	    # Remove punctuation marks and text-based emoticons (same `\|` fix).
	    # NOTE(review): every listed emoticon starts with ':' or ';', which
	    # `[^\w\s]` matches first, so the emoticon alternatives never match on
	    # their own; they are kept to leave the original pattern intact.
	    text = re.sub(r'[^\w\s]|(:\)|:\(|:D|:P|:o|:O|;\))', ' ', text)
	    # Remove emoji
	    emoji_pattern = re.compile(
	        "["
	        "\U0001F600-\U0001F64F"  # emoticons
	        "\U0001F300-\U0001F5FF"  # symbols & pictographs
	        "\U0001F680-\U0001F6FF"  # transport & map symbols
	        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
	        "\U00002702-\U000027B0"
	        "\U000024C2-\U0001F251"
	        "]+",
	        flags=re.UNICODE,
	    )
	    text = emoji_pattern.sub(r' ', text)

	    # Replace multiple whitespace characters with a single space
	    text = re.sub(r'\s+', ' ', text).strip()
	    return text


	def is_offensive(sentence):
	    """Classify a sentence as offensive or non-offensive.

	    The sentence is normalized with ``clean_text``, tokenized, and passed
	    through the module-level ``model`` on ``device``.

	    Args:
	        sentence: Raw input text.

	    Returns:
	        ``'offensive'`` or ``'non-offensive'``.
	    """
	    cleaned = clean_text(sentence)

	    # Encode as PyTorch tensors, truncated to at most 256 tokens, and move
	    # every tensor to the device the model lives on.
	    encoded = tokenizer(cleaned, padding=True, truncation=True, max_length=256, return_tensors='pt')
	    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

	    # Inference only: disable gradient tracking so no autograd graph is
	    # built for the forward pass (this also makes a later detach unnecessary).
	    with torch.no_grad():
	        logits = model(**encoded).logits

	    # Index of the highest-scoring class for the single input sentence.
	    predicted = int(np.argmax(logits.cpu().numpy(), axis=1)[0])

	    labels = {0: 'non-offensive', 1: 'offensive'}
	    return labels[predicted]


	# Gradio UI: a two-line text box feeding is_offensive, with the returned
	# label shown as plain text.
	iface = gr.Interface(
	    title="Offensive Language Detection",
	    description="Offensive language detection for Turkish",
	    fn=is_offensive,
	    inputs=gr.Textbox(placeholder="Enter sentence here...", lines=2),
	    outputs="text",
	)

	# Start the app when this file is executed.
	iface.launch()