import re

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import (
    pipeline,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
    T5Tokenizer,
    T5ForConditionalGeneration,
)
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import faiss
import numpy as np
from datasets import load_dataset, Features, Value

# Initialize FastAPI app
app = FastAPI()


# Preprocessing function
def preprocess_text(text):
    """
    Cleans and normalizes text (the function lowercases and strips noise; it does not tokenize).
    """
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\s+", " ", text).strip()  # Collapse extra whitespace
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.lower()

# Content Classification Model
class ContentClassifier:
    def __init__(self, model_name="bert-base-uncased"):
        # Note: bert-base-uncased has no fine-tuned classification head, so predictions
        # are effectively random; point model_name at a checkpoint fine-tuned for your labels.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.pipeline = pipeline("text-classification", model=self.model, tokenizer=self.tokenizer)

    def classify(self, text):
        """
        Classifies text into predefined categories.
        """
        result = self.pipeline(text)
        return result

# Relevance Detection Model
class RelevanceDetector:
    def __init__(self, model_name="bert-base-uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.pipeline = pipeline("text-classification", model=self.model, tokenizer=self.tokenizer)

    def detect_relevance(self, text, threshold=0.5):
        """
        Detects whether a text is relevant to a specific domain.
        Assumes the checkpoint was fine-tuned with a "RELEVANT" label; the stock
        bert-base-uncased head only emits generic LABEL_0/LABEL_1 names.
        """
        result = self.pipeline(text)
        return result[0]["label"] == "RELEVANT" and result[0]["score"] > threshold

# Topic Extraction Model using BERTopic
class TopicExtractor:
    def __init__(self):
        self.model = BERTopic()

    def extract_topics(self, documents):
        """
        Extracts topics from a list of documents.
        Note: BERTopic clusters document embeddings, so it needs a reasonably sized
        corpus; fitting on one or two documents will generally fail or yield only outliers.
        """
        topics, probs = self.model.fit_transform(documents)
        return self.model.get_topic_info()

# Summarization Model
class Summarizer:
    def __init__(self, model_name="t5-small"):
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

    def summarize(self, text, max_length=100):
        """
        Summarizes a given text.
        """
        inputs = self.tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
        summary_ids = self.model.generate(inputs, max_length=max_length, min_length=25, length_penalty=2.0, num_beams=4)
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        return summary

# Search and Recommendation Model using FAISS
class SearchEngine:
    def __init__(self, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(embedding_model)
        self.index = None
        self.documents = []

    def build_index(self, docs):
        """
        Builds a FAISS index for document retrieval.
        """
        self.documents = docs
        embeddings = self.model.encode(docs, convert_to_tensor=True, show_progress_bar=True)
        self.index = faiss.IndexFlatL2(embeddings.shape[1])
        self.index.add(embeddings.cpu().detach().numpy())

    def search(self, query, top_k=5):
        """
        Searches the index for the top_k most relevant documents.
        """
        query_embedding = self.model.encode(query, convert_to_tensor=True)
        distances, indices = self.index.search(query_embedding.cpu().detach().numpy().reshape(1, -1), top_k)
        # Convert NumPy data types to native Python types for JSON serialization.
        # distances/indices are ordered by rank, so index distances by rank, not by document id.
        results = []
        for rank, doc_idx in enumerate(indices[0]):
            if doc_idx == -1:  # FAISS pads with -1 when the index holds fewer than top_k documents
                continue
            results.append({
                "document": self.documents[doc_idx],
                "distance": float(distances[0][rank]),
            })
        return results
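
# Conversational model. The original file instantiates Chatbot() below but never defines
# the class, so this is a minimal sketch using the already-imported AutoModelForCausalLM.
# The "distilgpt2" checkpoint and the generation settings are illustrative assumptions,
# not part of the original code.
class Chatbot:
    def __init__(self, model_name="distilgpt2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

    def chat(self, prompt, max_new_tokens=100):
        """
        Generates a free-form continuation of the prompt.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt")
        output_ids = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.95,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)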

# Initialize models
classifier = ContentClassifier()
relevance_detector = RelevanceDetector()
summarizer = Summarizer()
search_engine = SearchEngine()
topic_extractor = TopicExtractor()
chatbot = Chatbot()

# Initialize the search engine with a sample dataset
documents = [
    "This video explains Instagram growth hacks.",
    "Learn how to use hashtags effectively on Instagram.",
    "Collaborations are key to growing your Instagram audience.",
]
search_engine.build_index(documents)

# Define the schema
features = Features({
    "video_id": Value("string"),
    "video_link": Value("string"),
    "title": Value("string"),
    "text": Value("string"),
    "channel": Value("string"),
    "channel_id": Value("string"),
    "date": Value("string"),
    "license": Value("string"),
    "original_language": Value("string"),
    "source_language": Value("string"),
    "transcription_language": Value("string"),
    "word_count": Value("int64"),
    "character_count": Value("int64"),
})

# Load the dataset from the Hugging Face Hub (streaming avoids downloading the full corpus)
try:
    dataset = load_dataset(
        "PleIAs/YouTube-Commons",
        features=features,
        streaming=True,
    )
    # Process the dataset
    for example in dataset["train"]:
        print(example)  # Process each example
        break  # Stop after the first example for demonstration
except Exception as e:
    print(f"Error loading dataset: {e}")

# Pydantic models for request validation
class TextRequest(BaseModel):
    text: str


class QueryRequest(BaseModel):
    query: str


class PromptRequest(BaseModel):
    prompt: str

# API Endpoints (the route decorators were missing; paths are assumed from the handler names)
@app.post("/classify")
async def classify(request: TextRequest):
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided")
    result = classifier.classify(text)
    return {"result": result}


@app.post("/relevance")
async def relevance(request: TextRequest):
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided")
    relevant = relevance_detector.detect_relevance(text)
    return {"relevant": relevant}


@app.post("/summarize")
async def summarize(request: TextRequest):
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided")
    summary = summarizer.summarize(text)
    return {"summary": summary}


@app.post("/search")
async def search(request: QueryRequest):
    query = request.query
    if not query:
        raise HTTPException(status_code=400, detail="No query provided")
    try:
        results = search_engine.search(query)
        return {"results": results}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/topics")
async def topics(request: TextRequest):
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided")
    result = topic_extractor.extract_topics([text])
    return {"topics": result.to_dict()}
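
# The PromptRequest model and the chatbot instance are otherwise unused, which suggests a
# chat endpoint existed in the original. This /chat route is an assumed reconstruction
# wired to the sketched Chatbot class above, not part of the source file.
@app.post("/chat")
async def chat(request: PromptRequest):
    prompt = request.prompt
    if not prompt:
        raise HTTPException(status_code=400, detail="No prompt provided")
    response = chatbot.chat(prompt)
    return {"response": response}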

# Start the FastAPI app
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)