Spaces:
Sleeping
Sleeping
T Le
commited on
Commit
·
e3b3c4c
1
Parent(s):
6d81f16
Update to upstream
Browse files- images/summarization.png +0 -0
- images/wordcloud.png +0 -0
- pages/1 Scattertext.py +4 -3
- pages/2 Topic Modeling.py +674 -671
- pages/3 Bidirected Network.py +7 -7
- pages/4 Sunburst.py +25 -19
- pages/5 Burst Detection.py +74 -47
- pages/6 Keywords Stem.py +9 -3
- pages/7 Sentiment Analysis.py +19 -12
- pages/8 Shifterator.py +6 -5
images/summarization.png
ADDED
|
images/wordcloud.png
ADDED
|
pages/1 Scattertext.py
CHANGED
|
@@ -361,8 +361,9 @@ if uploaded_file is not None:
|
|
| 361 |
'Choose column',
|
| 362 |
(df_col_sel), on_change=reset_all)
|
| 363 |
|
| 364 |
-
list_words = paper[column_selected].
|
| 365 |
-
|
|
|
|
| 366 |
|
| 367 |
if column_selected is not None:
|
| 368 |
label1 = col2.selectbox(
|
|
@@ -444,7 +445,7 @@ if uploaded_file is not None:
|
|
| 444 |
st.write("Click the :blue[Download SVG] on the right side.")
|
| 445 |
st.divider()
|
| 446 |
st.subheader(':blue[Scattertext Dataframe]', anchor=False)
|
| 447 |
-
st.button('📥 Click to download result')
|
| 448 |
st.text("Click the Download button to get the CSV result.")
|
| 449 |
|
| 450 |
except NameError:
|
|
|
|
| 361 |
'Choose column',
|
| 362 |
(df_col_sel), on_change=reset_all)
|
| 363 |
|
| 364 |
+
list_words = paper[column_selected].dropna() # remove NaN
|
| 365 |
+
list_words = [w for w in list_words if str(w).strip() != ""] # remove empty strings
|
| 366 |
+
list_unique = sorted(set(list_words))
|
| 367 |
|
| 368 |
if column_selected is not None:
|
| 369 |
label1 = col2.selectbox(
|
|
|
|
| 445 |
st.write("Click the :blue[Download SVG] on the right side.")
|
| 446 |
st.divider()
|
| 447 |
st.subheader(':blue[Scattertext Dataframe]', anchor=False)
|
| 448 |
+
st.button('📥 Click to download result', on_click="ignore")
|
| 449 |
st.text("Click the Download button to get the CSV result.")
|
| 450 |
|
| 451 |
except NameError:
|
pages/2 Topic Modeling.py
CHANGED
|
@@ -1,671 +1,674 @@
|
|
| 1 |
-
#import module
|
| 2 |
-
import streamlit as st
|
| 3 |
-
import streamlit.components.v1 as components
|
| 4 |
-
import pandas as pd
|
| 5 |
-
import numpy as np
|
| 6 |
-
import re
|
| 7 |
-
import string
|
| 8 |
-
import nltk
|
| 9 |
-
nltk.download('wordnet')
|
| 10 |
-
from nltk.stem import WordNetLemmatizer
|
| 11 |
-
nltk.download('stopwords')
|
| 12 |
-
from nltk.corpus import stopwords
|
| 13 |
-
import gensim
|
| 14 |
-
import gensim.corpora as corpora
|
| 15 |
-
from gensim.corpora import Dictionary
|
| 16 |
-
from gensim.models.coherencemodel import CoherenceModel
|
| 17 |
-
from gensim.models.ldamodel import LdaModel
|
| 18 |
-
from gensim.models import Phrases
|
| 19 |
-
from gensim.models.phrases import Phraser
|
| 20 |
-
from pprint import pprint
|
| 21 |
-
import pickle
|
| 22 |
-
import pyLDAvis
|
| 23 |
-
import pyLDAvis.gensim_models as gensimvis
|
| 24 |
-
from io import StringIO
|
| 25 |
-
from ipywidgets.embed import embed_minimal_html
|
| 26 |
-
from nltk.stem.snowball import SnowballStemmer
|
| 27 |
-
from bertopic import BERTopic
|
| 28 |
-
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
|
| 29 |
-
import plotly.express as px
|
| 30 |
-
from sklearn.cluster import KMeans
|
| 31 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 32 |
-
import bitermplus as btm
|
| 33 |
-
import tmplot as tmp
|
| 34 |
-
import tomotopy
|
| 35 |
-
import sys
|
| 36 |
-
import spacy
|
| 37 |
-
import en_core_web_sm
|
| 38 |
-
import pipeline
|
| 39 |
-
from html2image import Html2Image
|
| 40 |
-
from umap import UMAP
|
| 41 |
-
import os
|
| 42 |
-
import time
|
| 43 |
-
import json
|
| 44 |
-
from tools import sourceformat as sf
|
| 45 |
-
import datamapplot
|
| 46 |
-
from sentence_transformers import SentenceTransformer
|
| 47 |
-
import openai
|
| 48 |
-
from transformers import pipeline
|
| 49 |
-
|
| 50 |
-
#===config===
|
| 51 |
-
st.set_page_config(
|
| 52 |
-
page_title="Coconut",
|
| 53 |
-
page_icon="🥥",
|
| 54 |
-
layout="wide",
|
| 55 |
-
initial_sidebar_state="collapsed"
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
hide_streamlit_style = """
|
| 59 |
-
<style>
|
| 60 |
-
#MainMenu
|
| 61 |
-
{visibility: hidden;}
|
| 62 |
-
footer {visibility: hidden;}
|
| 63 |
-
[data-testid="collapsedControl"] {display: none}
|
| 64 |
-
</style>
|
| 65 |
-
"""
|
| 66 |
-
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 67 |
-
|
| 68 |
-
with st.popover("🔗 Menu"):
|
| 69 |
-
st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
|
| 70 |
-
st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
|
| 71 |
-
st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
|
| 72 |
-
st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
|
| 73 |
-
st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
|
| 74 |
-
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
| 75 |
-
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
| 76 |
-
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
| 77 |
-
st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
|
| 78 |
-
st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
|
| 79 |
-
st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "🔟")
|
| 80 |
-
|
| 81 |
-
st.header("Topic Modeling", anchor=False)
|
| 82 |
-
st.subheader('Put your file here...', anchor=False)
|
| 83 |
-
|
| 84 |
-
#========unique id========
|
| 85 |
-
@st.cache_resource(ttl=3600)
|
| 86 |
-
def create_list():
|
| 87 |
-
l = [1, 2, 3]
|
| 88 |
-
return l
|
| 89 |
-
|
| 90 |
-
l = create_list()
|
| 91 |
-
first_list_value = l[0]
|
| 92 |
-
l[0] = first_list_value + 1
|
| 93 |
-
uID = str(l[0])
|
| 94 |
-
|
| 95 |
-
@st.cache_data(ttl=3600)
|
| 96 |
-
def get_ext(uploaded_file):
|
| 97 |
-
extype = uID+uploaded_file.name
|
| 98 |
-
return extype
|
| 99 |
-
|
| 100 |
-
#===clear cache===
|
| 101 |
-
|
| 102 |
-
def reset_biterm():
|
| 103 |
-
try:
|
| 104 |
-
biterm_map.clear()
|
| 105 |
-
biterm_bar.clear()
|
| 106 |
-
except NameError:
|
| 107 |
-
biterm_topic.clear()
|
| 108 |
-
|
| 109 |
-
def reset_all():
|
| 110 |
-
st.cache_data.clear()
|
| 111 |
-
|
| 112 |
-
#===avoiding deadlock===
|
| 113 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 114 |
-
|
| 115 |
-
#===upload file===
|
| 116 |
-
@st.cache_data(ttl=3600)
|
| 117 |
-
def upload(file):
|
| 118 |
-
papers = pd.read_csv(uploaded_file)
|
| 119 |
-
if "About the data" in papers.columns[0]:
|
| 120 |
-
papers = sf.dim(papers)
|
| 121 |
-
col_dict = {'MeSH terms': 'Keywords',
|
| 122 |
-
'PubYear': 'Year',
|
| 123 |
-
'Times cited': 'Cited by',
|
| 124 |
-
'Publication Type': 'Document Type'
|
| 125 |
-
}
|
| 126 |
-
papers.rename(columns=col_dict, inplace=True)
|
| 127 |
-
|
| 128 |
-
return papers
|
| 129 |
-
|
| 130 |
-
@st.cache_data(ttl=3600)
|
| 131 |
-
def conv_txt(extype):
|
| 132 |
-
if("PMID" in (uploaded_file.read()).decode()):
|
| 133 |
-
uploaded_file.seek(0)
|
| 134 |
-
papers = sf.medline(uploaded_file)
|
| 135 |
-
print(papers)
|
| 136 |
-
return papers
|
| 137 |
-
col_dict = {'TI': 'Title',
|
| 138 |
-
'SO': 'Source title',
|
| 139 |
-
'DE': 'Author Keywords',
|
| 140 |
-
'DT': 'Document Type',
|
| 141 |
-
'AB': 'Abstract',
|
| 142 |
-
'TC': 'Cited by',
|
| 143 |
-
'PY': 'Year',
|
| 144 |
-
'ID': 'Keywords Plus',
|
| 145 |
-
'rights_date_used': 'Year'}
|
| 146 |
-
uploaded_file.seek(0)
|
| 147 |
-
papers = pd.read_csv(uploaded_file, sep='\t')
|
| 148 |
-
if("htid" in papers.columns):
|
| 149 |
-
papers = sf.htrc(papers)
|
| 150 |
-
papers.rename(columns=col_dict, inplace=True)
|
| 151 |
-
print(papers)
|
| 152 |
-
return papers
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
@st.cache_data(ttl=3600)
|
| 156 |
-
def conv_json(extype):
|
| 157 |
-
col_dict={'title': 'title',
|
| 158 |
-
'rights_date_used': 'Year',
|
| 159 |
-
}
|
| 160 |
-
|
| 161 |
-
data = json.load(uploaded_file)
|
| 162 |
-
hathifile = data['gathers']
|
| 163 |
-
keywords = pd.DataFrame.from_records(hathifile)
|
| 164 |
-
|
| 165 |
-
keywords = sf.htrc(keywords)
|
| 166 |
-
keywords.rename(columns=col_dict,inplace=True)
|
| 167 |
-
return keywords
|
| 168 |
-
|
| 169 |
-
@st.
|
| 170 |
-
def conv_pub(extype):
|
| 171 |
-
if (get_ext(extype)).endswith('.tar.gz'):
|
| 172 |
-
bytedata = extype.read()
|
| 173 |
-
keywords = sf.readPub(bytedata)
|
| 174 |
-
elif (get_ext(extype)).endswith('.xml'):
|
| 175 |
-
bytedata = extype.read()
|
| 176 |
-
keywords = sf.readxml(bytedata)
|
| 177 |
-
return keywords
|
| 178 |
-
|
| 179 |
-
#===Read data===
|
| 180 |
-
uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
|
| 181 |
-
|
| 182 |
-
if uploaded_file is not None:
|
| 183 |
-
try:
|
| 184 |
-
extype = get_ext(uploaded_file)
|
| 185 |
-
|
| 186 |
-
if extype.endswith('.csv'):
|
| 187 |
-
papers = upload(extype)
|
| 188 |
-
elif extype.endswith('.txt'):
|
| 189 |
-
papers = conv_txt(extype)
|
| 190 |
-
|
| 191 |
-
elif extype.endswith('.json'):
|
| 192 |
-
papers = conv_json(extype)
|
| 193 |
-
elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
|
| 194 |
-
papers = conv_pub(uploaded_file)
|
| 195 |
-
|
| 196 |
-
coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
| 197 |
-
|
| 198 |
-
c1, c2, c3 = st.columns([3,3,4])
|
| 199 |
-
method = c1.selectbox(
|
| 200 |
-
'Choose method',
|
| 201 |
-
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
|
| 202 |
-
ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
|
| 203 |
-
num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
| 204 |
-
|
| 205 |
-
d1, d2 = st.columns([3,7])
|
| 206 |
-
xgram = d1.selectbox("N-grams", ("1", "2", "3"))
|
| 207 |
-
xgram = int(xgram)
|
| 208 |
-
words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)")
|
| 209 |
-
|
| 210 |
-
rem_copyright = d1.toggle('Remove copyright statement', value=True)
|
| 211 |
-
rem_punc = d2.toggle('Remove punctuation', value=True)
|
| 212 |
-
|
| 213 |
-
#===advance settings===
|
| 214 |
-
with st.expander("🧮 Show advance settings"):
|
| 215 |
-
t1, t2, t3 = st.columns(
|
| 216 |
-
if method == 'pyLDA':
|
| 217 |
-
py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1)
|
| 218 |
-
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1)
|
| 219 |
-
opt_threshold = t3.number_input('Threshold', value=100 , min_value=1, max_value=None, step=1)
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
paper['Abstract_pre'] = paper['Abstract_pre'].
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
hti.
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
resultf =
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
with
|
| 395 |
-
st.markdown('**
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
st.markdown('**
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
st.
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
st.
|
| 405 |
-
st.
|
| 406 |
-
st.
|
| 407 |
-
st.
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
st.altair_chart(
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
except
|
| 484 |
-
st.
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
st.markdown('**
|
| 491 |
-
|
| 492 |
-
st.markdown('**Li, J., Chen, W. H.,
|
| 493 |
-
|
| 494 |
-
st.
|
| 495 |
-
st.
|
| 496 |
-
|
| 497 |
-
st.
|
| 498 |
-
st.
|
| 499 |
-
st.
|
| 500 |
-
st.
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
with
|
| 656 |
-
st.markdown('**
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
st.
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
st.
|
| 664 |
-
st.
|
| 665 |
-
st.
|
| 666 |
-
st.
|
| 667 |
-
st.
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#import module
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import streamlit.components.v1 as components
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import numpy as np
|
| 6 |
+
import re
|
| 7 |
+
import string
|
| 8 |
+
import nltk
|
| 9 |
+
nltk.download('wordnet')
|
| 10 |
+
from nltk.stem import WordNetLemmatizer
|
| 11 |
+
nltk.download('stopwords')
|
| 12 |
+
from nltk.corpus import stopwords
|
| 13 |
+
import gensim
|
| 14 |
+
import gensim.corpora as corpora
|
| 15 |
+
from gensim.corpora import Dictionary
|
| 16 |
+
from gensim.models.coherencemodel import CoherenceModel
|
| 17 |
+
from gensim.models.ldamodel import LdaModel
|
| 18 |
+
from gensim.models import Phrases
|
| 19 |
+
from gensim.models.phrases import Phraser
|
| 20 |
+
from pprint import pprint
|
| 21 |
+
import pickle
|
| 22 |
+
import pyLDAvis
|
| 23 |
+
import pyLDAvis.gensim_models as gensimvis
|
| 24 |
+
from io import StringIO
|
| 25 |
+
from ipywidgets.embed import embed_minimal_html
|
| 26 |
+
from nltk.stem.snowball import SnowballStemmer
|
| 27 |
+
from bertopic import BERTopic
|
| 28 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, TextGeneration
|
| 29 |
+
import plotly.express as px
|
| 30 |
+
from sklearn.cluster import KMeans
|
| 31 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 32 |
+
import bitermplus as btm
|
| 33 |
+
import tmplot as tmp
|
| 34 |
+
import tomotopy
|
| 35 |
+
import sys
|
| 36 |
+
import spacy
|
| 37 |
+
import en_core_web_sm
|
| 38 |
+
import pipeline
|
| 39 |
+
from html2image import Html2Image
|
| 40 |
+
from umap import UMAP
|
| 41 |
+
import os
|
| 42 |
+
import time
|
| 43 |
+
import json
|
| 44 |
+
from tools import sourceformat as sf
|
| 45 |
+
import datamapplot
|
| 46 |
+
from sentence_transformers import SentenceTransformer
|
| 47 |
+
import openai
|
| 48 |
+
from transformers import pipeline
|
| 49 |
+
|
| 50 |
+
#===config===
|
| 51 |
+
st.set_page_config(
|
| 52 |
+
page_title="Coconut",
|
| 53 |
+
page_icon="🥥",
|
| 54 |
+
layout="wide",
|
| 55 |
+
initial_sidebar_state="collapsed"
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
hide_streamlit_style = """
|
| 59 |
+
<style>
|
| 60 |
+
#MainMenu
|
| 61 |
+
{visibility: hidden;}
|
| 62 |
+
footer {visibility: hidden;}
|
| 63 |
+
[data-testid="collapsedControl"] {display: none}
|
| 64 |
+
</style>
|
| 65 |
+
"""
|
| 66 |
+
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
| 67 |
+
|
| 68 |
+
with st.popover("🔗 Menu"):
|
| 69 |
+
st.page_link("https://www.coconut-libtool.com/", label="Home", icon="🏠")
|
| 70 |
+
st.page_link("pages/1 Scattertext.py", label="Scattertext", icon="1️⃣")
|
| 71 |
+
st.page_link("pages/2 Topic Modeling.py", label="Topic Modeling", icon="2️⃣")
|
| 72 |
+
st.page_link("pages/3 Bidirected Network.py", label="Bidirected Network", icon="3️⃣")
|
| 73 |
+
st.page_link("pages/4 Sunburst.py", label="Sunburst", icon="4️⃣")
|
| 74 |
+
st.page_link("pages/5 Burst Detection.py", label="Burst Detection", icon="5️⃣")
|
| 75 |
+
st.page_link("pages/6 Keywords Stem.py", label="Keywords Stem", icon="6️⃣")
|
| 76 |
+
st.page_link("pages/7 Sentiment Analysis.py", label="Sentiment Analysis", icon="7️⃣")
|
| 77 |
+
st.page_link("pages/8 Shifterator.py", label="Shifterator", icon="8️⃣")
|
| 78 |
+
st.page_link("pages/9 Summarization.py", label = "Summarization",icon ="9️⃣")
|
| 79 |
+
st.page_link("pages/10 WordCloud.py", label = "WordCloud", icon = "🔟")
|
| 80 |
+
|
| 81 |
+
st.header("Topic Modeling", anchor=False)
|
| 82 |
+
st.subheader('Put your file here...', anchor=False)
|
| 83 |
+
|
| 84 |
+
#========unique id========
|
| 85 |
+
@st.cache_resource(ttl=3600)
|
| 86 |
+
def create_list():
|
| 87 |
+
l = [1, 2, 3]
|
| 88 |
+
return l
|
| 89 |
+
|
| 90 |
+
l = create_list()
|
| 91 |
+
first_list_value = l[0]
|
| 92 |
+
l[0] = first_list_value + 1
|
| 93 |
+
uID = str(l[0])
|
| 94 |
+
|
| 95 |
+
@st.cache_data(ttl=3600)
|
| 96 |
+
def get_ext(uploaded_file):
|
| 97 |
+
extype = uID+uploaded_file.name
|
| 98 |
+
return extype
|
| 99 |
+
|
| 100 |
+
#===clear cache===
|
| 101 |
+
|
| 102 |
+
def reset_biterm():
|
| 103 |
+
try:
|
| 104 |
+
biterm_map.clear()
|
| 105 |
+
biterm_bar.clear()
|
| 106 |
+
except NameError:
|
| 107 |
+
biterm_topic.clear()
|
| 108 |
+
|
| 109 |
+
def reset_all():
|
| 110 |
+
st.cache_data.clear()
|
| 111 |
+
|
| 112 |
+
#===avoiding deadlock===
|
| 113 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 114 |
+
|
| 115 |
+
#===upload file===
|
| 116 |
+
@st.cache_data(ttl=3600)
|
| 117 |
+
def upload(file):
|
| 118 |
+
papers = pd.read_csv(uploaded_file)
|
| 119 |
+
if "About the data" in papers.columns[0]:
|
| 120 |
+
papers = sf.dim(papers)
|
| 121 |
+
col_dict = {'MeSH terms': 'Keywords',
|
| 122 |
+
'PubYear': 'Year',
|
| 123 |
+
'Times cited': 'Cited by',
|
| 124 |
+
'Publication Type': 'Document Type'
|
| 125 |
+
}
|
| 126 |
+
papers.rename(columns=col_dict, inplace=True)
|
| 127 |
+
|
| 128 |
+
return papers
|
| 129 |
+
|
| 130 |
+
@st.cache_data(ttl=3600)
|
| 131 |
+
def conv_txt(extype):
|
| 132 |
+
if("PMID" in (uploaded_file.read()).decode()):
|
| 133 |
+
uploaded_file.seek(0)
|
| 134 |
+
papers = sf.medline(uploaded_file)
|
| 135 |
+
print(papers)
|
| 136 |
+
return papers
|
| 137 |
+
col_dict = {'TI': 'Title',
|
| 138 |
+
'SO': 'Source title',
|
| 139 |
+
'DE': 'Author Keywords',
|
| 140 |
+
'DT': 'Document Type',
|
| 141 |
+
'AB': 'Abstract',
|
| 142 |
+
'TC': 'Cited by',
|
| 143 |
+
'PY': 'Year',
|
| 144 |
+
'ID': 'Keywords Plus',
|
| 145 |
+
'rights_date_used': 'Year'}
|
| 146 |
+
uploaded_file.seek(0)
|
| 147 |
+
papers = pd.read_csv(uploaded_file, sep='\t')
|
| 148 |
+
if("htid" in papers.columns):
|
| 149 |
+
papers = sf.htrc(papers)
|
| 150 |
+
papers.rename(columns=col_dict, inplace=True)
|
| 151 |
+
print(papers)
|
| 152 |
+
return papers
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@st.cache_data(ttl=3600)
|
| 156 |
+
def conv_json(extype):
|
| 157 |
+
col_dict={'title': 'title',
|
| 158 |
+
'rights_date_used': 'Year',
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
data = json.load(uploaded_file)
|
| 162 |
+
hathifile = data['gathers']
|
| 163 |
+
keywords = pd.DataFrame.from_records(hathifile)
|
| 164 |
+
|
| 165 |
+
keywords = sf.htrc(keywords)
|
| 166 |
+
keywords.rename(columns=col_dict,inplace=True)
|
| 167 |
+
return keywords
|
| 168 |
+
|
| 169 |
+
@st.cache_data(ttl=3600)
|
| 170 |
+
def conv_pub(extype):
|
| 171 |
+
if (get_ext(extype)).endswith('.tar.gz'):
|
| 172 |
+
bytedata = extype.read()
|
| 173 |
+
keywords = sf.readPub(bytedata)
|
| 174 |
+
elif (get_ext(extype)).endswith('.xml'):
|
| 175 |
+
bytedata = extype.read()
|
| 176 |
+
keywords = sf.readxml(bytedata)
|
| 177 |
+
return keywords
|
| 178 |
+
|
| 179 |
+
#===Read data===
|
| 180 |
+
uploaded_file = st.file_uploader('', type=['csv', 'txt','json','tar.gz','xml'], on_change=reset_all)
|
| 181 |
+
|
| 182 |
+
if uploaded_file is not None:
|
| 183 |
+
try:
|
| 184 |
+
extype = get_ext(uploaded_file)
|
| 185 |
+
|
| 186 |
+
if extype.endswith('.csv'):
|
| 187 |
+
papers = upload(extype)
|
| 188 |
+
elif extype.endswith('.txt'):
|
| 189 |
+
papers = conv_txt(extype)
|
| 190 |
+
|
| 191 |
+
elif extype.endswith('.json'):
|
| 192 |
+
papers = conv_json(extype)
|
| 193 |
+
elif extype.endswith('.tar.gz') or extype.endswith('.xml'):
|
| 194 |
+
papers = conv_pub(uploaded_file)
|
| 195 |
+
|
| 196 |
+
coldf = sorted(papers.select_dtypes(include=['object']).columns.tolist())
|
| 197 |
+
|
| 198 |
+
c1, c2, c3 = st.columns([3,3,4])
|
| 199 |
+
method = c1.selectbox(
|
| 200 |
+
'Choose method',
|
| 201 |
+
('Choose...', 'pyLDA', 'Biterm', 'BERTopic'))
|
| 202 |
+
ColCho = c2.selectbox('Choose column', (["Abstract","Title", "Abstract + Title"]))
|
| 203 |
+
num_cho = c3.number_input('Choose number of topics', min_value=2, max_value=30, value=5)
|
| 204 |
+
|
| 205 |
+
d1, d2 = st.columns([3,7])
|
| 206 |
+
xgram = d1.selectbox("N-grams", ("1", "2", "3"), on_change=reset_all)
|
| 207 |
+
xgram = int(xgram)
|
| 208 |
+
words_to_remove = d2.text_input("Remove specific words. Separate words by semicolons (;)", on_change=reset_all)
|
| 209 |
+
|
| 210 |
+
rem_copyright = d1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
| 211 |
+
rem_punc = d2.toggle('Remove punctuation', value=True, on_change=reset_all)
|
| 212 |
+
|
| 213 |
+
#===advance settings===
|
| 214 |
+
with st.expander("🧮 Show advance settings"):
|
| 215 |
+
t1, t2, t3, t4 = st.columns(4)
|
| 216 |
+
if method == 'pyLDA':
|
| 217 |
+
py_random_state = t1.number_input('Random state', min_value=0, max_value=None, step=1, help='Ensuring the reproducibility.')
|
| 218 |
+
py_chunksize = t2.number_input('Chunk size', value=100 , min_value=10, max_value=None, step=1, help='Number of documents to be used in each training chunk.')
|
| 219 |
+
opt_threshold = t3.number_input('Threshold (Gensim)', value=100 , min_value=1, max_value=None, step=1, help='Lower = More phrases. Higher = Fewer phrases.')
|
| 220 |
+
opt_relevance = t4.number_input('Lambda (λ)', value=0.6 , min_value=0.0, max_value=1.0, step=0.01, help='Lower = More unique. Higher = More frequent.')
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
elif method == 'Biterm':
|
| 224 |
+
btm_seed = t1.number_input('Random state seed', value=100 , min_value=1, max_value=None, step=1, help='Ensuring the reproducibility.')
|
| 225 |
+
btm_iterations = t2.number_input('Iterations number', value=20 , min_value=2, max_value=None, step=1, help='Number of iterations the model fitting process has gone through.')
|
| 226 |
+
opt_threshold = t3.number_input('Threshold (Gensim)', value=100 , min_value=1, max_value=None, step=1, help='Lower = More phrases. Higher = Fewer phrases.')
|
| 227 |
+
|
| 228 |
+
elif method == 'BERTopic':
|
| 229 |
+
#u1, u2 = st.columns([5,5])
|
| 230 |
+
|
| 231 |
+
bert_top_n_words = t1.number_input('top_n_words', value=5 , min_value=5, max_value=25, step=1, help='Number of words per topic.')
|
| 232 |
+
bert_random_state = t2.number_input('random_state', value=42 , min_value=1, max_value=None, step=1, help="Please be aware we currently can't do the reproducibility on Bertopic.")
|
| 233 |
+
bert_n_components = t3.number_input('n_components', value=5 , min_value=1, max_value=None, step=1, help='The dimensionality of the embeddings after reducing them.')
|
| 234 |
+
bert_n_neighbors = t4.number_input('n_neighbors', value=15 , min_value=1, max_value=None, step=1, help='The number of neighboring sample points used when making the manifold approximation.')
|
| 235 |
+
bert_embedding_model = st.radio(
|
| 236 |
+
"embedding_model",
|
| 237 |
+
["all-MiniLM-L6-v2", "paraphrase-multilingual-MiniLM-L12-v2", "en_core_web_sm"], index=0, horizontal=True, help= 'Select paraphrase-multilingual if your documents are in a language other than English or are multilingual.')
|
| 238 |
+
|
| 239 |
+
fine_tuning = st.toggle("Use Fine-tuning")
|
| 240 |
+
if fine_tuning:
|
| 241 |
+
topic_labelling = st.toggle("Automatic topic labelling")
|
| 242 |
+
if topic_labelling:
|
| 243 |
+
llm_provider = st.selectbox("Model",["OpenAI/gpt-4o","Google/flan-t5","LiquidAI/LFM2-350M"])
|
| 244 |
+
if llm_provider == "OpenAI/gpt-4o":
|
| 245 |
+
api_key = st.text_input("API Key")
|
| 246 |
+
|
| 247 |
+
else:
|
| 248 |
+
st.write('Please choose your preferred method')
|
| 249 |
+
|
| 250 |
+
#===clean csv===
|
| 251 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 252 |
+
def clean_csv(extype):
|
| 253 |
+
if (ColCho=="Abstract + Title"):
|
| 254 |
+
papers["Abstract + Title"] = papers["Title"] + " " + papers["Abstract"]
|
| 255 |
+
st.write(papers["Abstract + Title"])
|
| 256 |
+
|
| 257 |
+
paper = papers.dropna(subset=[ColCho])
|
| 258 |
+
|
| 259 |
+
#===mapping===
|
| 260 |
+
paper['Abstract_pre'] = paper[ColCho].map(lambda x: x.lower())
|
| 261 |
+
if rem_punc:
|
| 262 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].map(
|
| 263 |
+
lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x)
|
| 264 |
+
).map(lambda x: re.sub(r"\s+", " ", x).strip())
|
| 265 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].str.replace('[\u2018\u2019\u201c\u201d]', '', regex=True)
|
| 266 |
+
if rem_copyright:
|
| 267 |
+
paper['Abstract_pre'] = paper['Abstract_pre'].map(lambda x: re.sub('©.*', '', x))
|
| 268 |
+
|
| 269 |
+
#===stopword removal===
|
| 270 |
+
stop = stopwords.words('english')
|
| 271 |
+
paper['Abstract_stop'] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
|
| 272 |
+
|
| 273 |
+
#===lemmatize===
|
| 274 |
+
lemmatizer = WordNetLemmatizer()
|
| 275 |
+
|
| 276 |
+
@st.cache_data(ttl=3600)
|
| 277 |
+
def lemmatize_words(text):
|
| 278 |
+
words = text.split()
|
| 279 |
+
words = [lemmatizer.lemmatize(word) for word in words]
|
| 280 |
+
return ' '.join(words)
|
| 281 |
+
paper['Abstract_lem'] = paper['Abstract_stop'].apply(lemmatize_words)
|
| 282 |
+
|
| 283 |
+
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
| 284 |
+
remove_dict = {word: None for word in words_rmv}
|
| 285 |
+
|
| 286 |
+
@st.cache_data(ttl=3600)
|
| 287 |
+
def remove_words(text):
|
| 288 |
+
words = text.split()
|
| 289 |
+
cleaned_words = [word for word in words if word not in remove_dict]
|
| 290 |
+
return ' '.join(cleaned_words)
|
| 291 |
+
paper['Abstract_lem'] = paper['Abstract_lem'].map(remove_words)
|
| 292 |
+
|
| 293 |
+
topic_abs = paper.Abstract_lem.values.tolist()
|
| 294 |
+
return topic_abs, paper
|
| 295 |
+
|
| 296 |
+
topic_abs, paper=clean_csv(extype)
|
| 297 |
+
|
| 298 |
+
if st.button("Submit", on_click=reset_all):
|
| 299 |
+
num_topic = num_cho
|
| 300 |
+
|
| 301 |
+
if method == 'BERTopic':
|
| 302 |
+
st.info('BERTopic is an expensive process when dealing with a large volume of text with our existing resources. Please kindly wait until the visualization appears.', icon="ℹ️")
|
| 303 |
+
|
| 304 |
+
#===topic===
|
| 305 |
+
if method == 'Choose...':
|
| 306 |
+
st.write('')
|
| 307 |
+
|
| 308 |
+
elif method == 'pyLDA':
|
| 309 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 310 |
+
|
| 311 |
+
with tab1:
|
| 312 |
+
#===visualization===
|
| 313 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 314 |
+
def pylda(extype):
|
| 315 |
+
topic_abs_LDA = [t.split(' ') for t in topic_abs]
|
| 316 |
+
|
| 317 |
+
bigram = Phrases(topic_abs_LDA, min_count=xgram, threshold=opt_threshold)
|
| 318 |
+
trigram = Phrases(bigram[topic_abs_LDA], threshold=opt_threshold)
|
| 319 |
+
bigram_mod = Phraser(bigram)
|
| 320 |
+
trigram_mod = Phraser(trigram)
|
| 321 |
+
|
| 322 |
+
topic_abs_LDA = [trigram_mod[bigram_mod[doc]] for doc in topic_abs_LDA]
|
| 323 |
+
|
| 324 |
+
id2word = Dictionary(topic_abs_LDA)
|
| 325 |
+
corpus = [id2word.doc2bow(text) for text in topic_abs_LDA]
|
| 326 |
+
#===LDA===
|
| 327 |
+
lda_model = LdaModel(corpus=corpus,
|
| 328 |
+
id2word=id2word,
|
| 329 |
+
num_topics=num_topic,
|
| 330 |
+
random_state=py_random_state,
|
| 331 |
+
chunksize=py_chunksize,
|
| 332 |
+
alpha='auto',
|
| 333 |
+
gamma_threshold=opt_relevance,
|
| 334 |
+
per_word_topics=False)
|
| 335 |
+
pprint(lda_model.print_topics())
|
| 336 |
+
doc_lda = lda_model[corpus]
|
| 337 |
+
topics = lda_model.show_topics(num_words = 30,formatted=False)
|
| 338 |
+
|
| 339 |
+
#===visualization===
|
| 340 |
+
coherence_model_lda = CoherenceModel(model=lda_model, texts=topic_abs_LDA, dictionary=id2word, coherence='c_v')
|
| 341 |
+
coherence_lda = coherence_model_lda.get_coherence()
|
| 342 |
+
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
|
| 343 |
+
py_lda_vis_html = pyLDAvis.prepared_data_to_html(vis)
|
| 344 |
+
return py_lda_vis_html, coherence_lda, vis, topics
|
| 345 |
+
|
| 346 |
+
with st.spinner('Performing computations. Please wait ...'):
|
| 347 |
+
try:
|
| 348 |
+
py_lda_vis_html, coherence_lda, vis, topics = pylda(extype)
|
| 349 |
+
st.write('Coherence score: ', coherence_lda)
|
| 350 |
+
components.html(py_lda_vis_html, width=1500, height=800)
|
| 351 |
+
st.markdown('Copyright (c) 2015, Ben Mabey. https://github.com/bmabey/pyLDAvis')
|
| 352 |
+
|
| 353 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 354 |
+
def img_lda(vis):
|
| 355 |
+
pyLDAvis.save_html(vis, 'output.html')
|
| 356 |
+
hti = Html2Image()
|
| 357 |
+
hti.browser.flags = ['--default-background-color=ffffff', '--hide-scrollbars']
|
| 358 |
+
hti.browser.use_new_headless = None
|
| 359 |
+
css = "body {background: white;}"
|
| 360 |
+
hti.screenshot(
|
| 361 |
+
other_file='output.html', css_str=css, size=(1500, 800),
|
| 362 |
+
save_as='ldavis_img.png'
|
| 363 |
+
)
|
| 364 |
+
|
| 365 |
+
img_lda(vis)
|
| 366 |
+
|
| 367 |
+
d1, d2 = st.columns(2)
|
| 368 |
+
with open("ldavis_img.png", "rb") as file:
|
| 369 |
+
btn = d1.download_button(
|
| 370 |
+
label="Download image",
|
| 371 |
+
data=file,
|
| 372 |
+
file_name="ldavis_img.png",
|
| 373 |
+
mime="image/png"
|
| 374 |
+
)
|
| 375 |
+
|
| 376 |
+
#===download results===#
|
| 377 |
+
resultf = pd.DataFrame(topics)
|
| 378 |
+
#formatting
|
| 379 |
+
resultf = resultf.transpose()
|
| 380 |
+
resultf = resultf.drop([0])
|
| 381 |
+
resultf = resultf.explode(list(range(len(resultf.columns))), ignore_index=False)
|
| 382 |
+
|
| 383 |
+
resultcsv = resultf.to_csv().encode("utf-8")
|
| 384 |
+
d2.download_button(
|
| 385 |
+
label = "Download Results",
|
| 386 |
+
data=resultcsv,
|
| 387 |
+
file_name="results.csv",
|
| 388 |
+
mime="text\csv",
|
| 389 |
+
on_click="ignore")
|
| 390 |
+
|
| 391 |
+
except NameError as f:
|
| 392 |
+
st.warning('🖱️ Please click Submit')
|
| 393 |
+
|
| 394 |
+
with tab2:
|
| 395 |
+
st.markdown('**Sievert, C., & Shirley, K. (2014). LDAvis: A method for visualizing and interpreting topics. Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces.** https://doi.org/10.3115/v1/w14-3110')
|
| 396 |
+
|
| 397 |
+
with tab3:
|
| 398 |
+
st.markdown('**Chen, X., & Wang, H. (2019, January). Automated chat transcript analysis using topic modeling for library reference services. Proceedings of the Association for Information Science and Technology, 56(1), 368–371.** https://doi.org/10.1002/pra2.31')
|
| 399 |
+
st.markdown('**Joo, S., Ingram, E., & Cahill, M. (2021, December 15). Exploring Topics and Genres in Storytime Books: A Text Mining Approach. Evidence Based Library and Information Practice, 16(4), 41–62.** https://doi.org/10.18438/eblip29963')
|
| 400 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Topic Modeling. Text Mining for Information Professionals, 105–137.** https://doi.org/10.1007/978-3-030-85085-2_4')
|
| 401 |
+
st.markdown('**Lamba, M., & Madhusudhan, M. (2019, June 7). Mapping of topics in DESIDOC Journal of Library and Information Technology, India: a study. Scientometrics, 120(2), 477–505.** https://doi.org/10.1007/s11192-019-03137-5')
|
| 402 |
+
|
| 403 |
+
with tab4:
|
| 404 |
+
st.subheader(':blue[pyLDA]', anchor=False)
|
| 405 |
+
st.button('Download image')
|
| 406 |
+
st.text("Click Download Image button.")
|
| 407 |
+
st.divider()
|
| 408 |
+
st.subheader(':blue[Downloading CSV Results]', anchor=False)
|
| 409 |
+
st.button("Download Results")
|
| 410 |
+
st.text("Click Download results button at bottom of page")
|
| 411 |
+
|
| 412 |
+
#===Biterm===
|
| 413 |
+
elif method == 'Biterm':
|
| 414 |
+
|
| 415 |
+
#===optimize Biterm===
|
| 416 |
+
@st.cache_data(ttl=3600, show_spinner=False)
|
| 417 |
+
def biterm_topic(extype):
|
| 418 |
+
tokenized_abs = [t.split(' ') for t in topic_abs]
|
| 419 |
+
|
| 420 |
+
bigram = Phrases(tokenized_abs, min_count=xgram, threshold=opt_threshold)
|
| 421 |
+
trigram = Phrases(bigram[tokenized_abs], threshold=opt_threshold)
|
| 422 |
+
bigram_mod = Phraser(bigram)
|
| 423 |
+
trigram_mod = Phraser(trigram)
|
| 424 |
+
|
| 425 |
+
topic_abs_ngram = [trigram_mod[bigram_mod[doc]] for doc in tokenized_abs]
|
| 426 |
+
|
| 427 |
+
topic_abs_str = [' '.join(doc) for doc in topic_abs_ngram]
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
X, vocabulary, vocab_dict = btm.get_words_freqs(topic_abs_str)
|
| 431 |
+
tf = np.array(X.sum(axis=0)).ravel()
|
| 432 |
+
docs_vec = btm.get_vectorized_docs(topic_abs, vocabulary)
|
| 433 |
+
docs_lens = list(map(len, docs_vec))
|
| 434 |
+
biterms = btm.get_biterms(docs_vec)
|
| 435 |
+
|
| 436 |
+
model = btm.BTM(X, vocabulary, seed=btm_seed, T=num_topic, M=20, alpha=50/8, beta=0.01)
|
| 437 |
+
model.fit(biterms, iterations=btm_iterations)
|
| 438 |
+
|
| 439 |
+
p_zd = model.transform(docs_vec)
|
| 440 |
+
coherence = model.coherence_
|
| 441 |
+
phi = tmp.get_phi(model)
|
| 442 |
+
topics_coords = tmp.prepare_coords(model)
|
| 443 |
+
totaltop = topics_coords.label.values.tolist()
|
| 444 |
+
perplexity = model.perplexity_
|
| 445 |
+
top_topics = model.df_words_topics_
|
| 446 |
+
|
| 447 |
+
return topics_coords, phi, totaltop, perplexity, top_topics
|
| 448 |
+
|
| 449 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 450 |
+
with tab1:
|
| 451 |
+
try:
|
| 452 |
+
with st.spinner('Performing computations. Please wait ...'):
|
| 453 |
+
topics_coords, phi, totaltop, perplexity, top_topics = biterm_topic(extype)
|
| 454 |
+
col1, col2 = st.columns([4,6])
|
| 455 |
+
|
| 456 |
+
@st.cache_data(ttl=3600)
|
| 457 |
+
def biterm_map(extype):
|
| 458 |
+
btmvis_coords = tmp.plot_scatter_topics(topics_coords, size_col='size', label_col='label', topic=numvis)
|
| 459 |
+
return btmvis_coords
|
| 460 |
+
|
| 461 |
+
@st.cache_data(ttl=3600)
|
| 462 |
+
def biterm_bar(extype):
|
| 463 |
+
terms_probs = tmp.calc_terms_probs_ratio(phi, topic=numvis, lambda_=1)
|
| 464 |
+
btmvis_probs = tmp.plot_terms(terms_probs, font_size=12)
|
| 465 |
+
return btmvis_probs
|
| 466 |
+
|
| 467 |
+
with col1:
|
| 468 |
+
st.write('Perplexity score: ', perplexity)
|
| 469 |
+
st.write('')
|
| 470 |
+
numvis = st.selectbox(
|
| 471 |
+
'Choose topic',
|
| 472 |
+
(totaltop), on_change=reset_biterm)
|
| 473 |
+
btmvis_coords = biterm_map(extype)
|
| 474 |
+
st.altair_chart(btmvis_coords)
|
| 475 |
+
with col2:
|
| 476 |
+
btmvis_probs = biterm_bar(extype)
|
| 477 |
+
st.altair_chart(btmvis_probs, use_container_width=True)
|
| 478 |
+
|
| 479 |
+
#===download results===#
|
| 480 |
+
resultcsv = top_topics.to_csv().encode("utf-8")
|
| 481 |
+
st.download_button(label = "Download Results", data=resultcsv, file_name="results.csv", mime="text\csv", on_click="ignore")
|
| 482 |
+
|
| 483 |
+
except ValueError as g:
|
| 484 |
+
st.error('🙇♂️ Please raise the number of topics and click submit')
|
| 485 |
+
|
| 486 |
+
except NameError as f:
|
| 487 |
+
st.warning('🖱️ Please click Submit')
|
| 488 |
+
|
| 489 |
+
with tab2:
|
| 490 |
+
st.markdown('**Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013, May 13). A biterm topic model for short texts. Proceedings of the 22nd International Conference on World Wide Web.** https://doi.org/10.1145/2488388.2488514')
|
| 491 |
+
with tab3:
|
| 492 |
+
st.markdown('**Cai, M., Shah, N., Li, J., Chen, W. H., Cuomo, R. E., Obradovich, N., & Mackey, T. K. (2020, August 26). Identification and characterization of tweets related to the 2015 Indiana HIV outbreak: A retrospective infoveillance study. PLOS ONE, 15(8), e0235150.** https://doi.org/10.1371/journal.pone.0235150')
|
| 493 |
+
st.markdown('**Chen, Y., Dong, T., Ban, Q., & Li, Y. (2021). What Concerns Consumers about Hypertension? A Comparison between the Online Health Community and the Q&A Forum. International Journal of Computational Intelligence Systems, 14(1), 734.** https://doi.org/10.2991/ijcis.d.210203.002')
|
| 494 |
+
st.markdown('**George, Crissandra J., "AMBIGUOUS APPALACHIANNESS: A LINGUISTIC AND PERCEPTUAL INVESTIGATION INTO ARC-LABELED PENNSYLVANIA COUNTIES" (2022). Theses and Dissertations-- Linguistics. 48.** https://doi.org/10.13023/etd.2022.217')
|
| 495 |
+
st.markdown('**Li, J., Chen, W. H., Xu, Q., Shah, N., Kohler, J. C., & Mackey, T. K. (2020). Detection of self-reported experiences with corruption on twitter using unsupervised machine learning. Social Sciences & Humanities Open, 2(1), 100060.** https://doi.org/10.1016/j.ssaho.2020.100060')
|
| 496 |
+
with tab4:
|
| 497 |
+
st.subheader(':blue[Biterm]', anchor=False)
|
| 498 |
+
st.text("Click the three dots at the top right then select the desired format.")
|
| 499 |
+
st.markdown("")
|
| 500 |
+
st.divider()
|
| 501 |
+
st.subheader(':blue[Downloading CSV Results]', anchor=False)
|
| 502 |
+
st.button("Download Results")
|
| 503 |
+
st.text("Click Download results button at bottom of page")
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
#===BERTopic===
|
| 507 |
+
elif method == 'BERTopic':
|
| 508 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 509 |
+
#@st.cache_data(ttl=3600, show_spinner=False)
|
| 510 |
+
def bertopic_vis(extype):
|
| 511 |
+
umap_model = UMAP(n_neighbors=bert_n_neighbors, n_components=bert_n_components,
|
| 512 |
+
min_dist=0.0, metric='cosine', random_state=bert_random_state)
|
| 513 |
+
cluster_model = KMeans(n_clusters=num_topic)
|
| 514 |
+
if bert_embedding_model == 'all-MiniLM-L6-v2':
|
| 515 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 516 |
+
lang = 'en'
|
| 517 |
+
embeddings = model.encode(topic_abs, show_progress_bar=True)
|
| 518 |
+
|
| 519 |
+
elif bert_embedding_model == 'en_core_web_sm':
|
| 520 |
+
nlp = en_core_web_sm.load(exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'])
|
| 521 |
+
model = nlp
|
| 522 |
+
lang = 'en'
|
| 523 |
+
embeddings = np.array([nlp(text).vector for text in topic_abs])
|
| 524 |
+
|
| 525 |
+
elif bert_embedding_model == 'paraphrase-multilingual-MiniLM-L12-v2':
|
| 526 |
+
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
|
| 527 |
+
lang = 'multilingual'
|
| 528 |
+
embeddings = model.encode(topic_abs, show_progress_bar=True)
|
| 529 |
+
|
| 530 |
+
representation_model = ""
|
| 531 |
+
|
| 532 |
+
if fine_tuning:
|
| 533 |
+
keybert = KeyBERTInspired()
|
| 534 |
+
mmr = MaximalMarginalRelevance(diversity=0.3)
|
| 535 |
+
representation_model = {
|
| 536 |
+
"KeyBERT": keybert,
|
| 537 |
+
"MMR": mmr,
|
| 538 |
+
}
|
| 539 |
+
if topic_labelling:
|
| 540 |
+
if llm_provider == "OpenAI/gpt-4o":
|
| 541 |
+
client = openai.OpenAI(api_key=api_key)
|
| 542 |
+
representation_model = {
|
| 543 |
+
"KeyBERT": keybert,
|
| 544 |
+
"MMR": mmr,
|
| 545 |
+
"test": OpenAI(client, model = "gpt-4o-mini", delay_in_seconds=10)
|
| 546 |
+
}
|
| 547 |
+
elif llm_provider == "Google/flan-t5":
|
| 548 |
+
pipe = pipeline("text2text-generation", model = "google/flan-t5-base")
|
| 549 |
+
clientmod = TextGeneration(pipe)
|
| 550 |
+
representation_model = {
|
| 551 |
+
"KeyBERT": keybert,
|
| 552 |
+
"MMR": mmr,
|
| 553 |
+
"test": clientmod
|
| 554 |
+
}
|
| 555 |
+
elif llm_provider == "LiquidAI/LFM2-350M":
|
| 556 |
+
pipe = pipeline("text-generation", model = "LiquidAI/LFM2-350M")
|
| 557 |
+
clientmod = TextGeneration(pipe)
|
| 558 |
+
representation_model = {
|
| 559 |
+
"KeyBERT": keybert,
|
| 560 |
+
"MMR": mmr,
|
| 561 |
+
"test": clientmod
|
| 562 |
+
}
|
| 563 |
+
|
| 564 |
+
vectorizer_model = CountVectorizer(ngram_range=(1, xgram), stop_words='english')
|
| 565 |
+
topic_model = BERTopic(representation_model = representation_model, embedding_model=model, hdbscan_model=cluster_model, language=lang, umap_model=umap_model, vectorizer_model=vectorizer_model, top_n_words=bert_top_n_words)
|
| 566 |
+
topics, probs = topic_model.fit_transform(topic_abs, embeddings=embeddings)
|
| 567 |
+
|
| 568 |
+
if(fine_tuning and topic_labelling):
|
| 569 |
+
generated_labels = [label[0][0].split("\n")[0] for label in topic_model.get_topics(full=True)["test"].values()]
|
| 570 |
+
topic_model.set_topic_labels(generated_labels)
|
| 571 |
+
|
| 572 |
+
return topic_model, topics, probs, embeddings
|
| 573 |
+
|
| 574 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 575 |
+
def Vis_Topics(extype):
|
| 576 |
+
fig1 = topic_model.visualize_topics(custom_labels = True)
|
| 577 |
+
return fig1
|
| 578 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 579 |
+
def Vis_Documents(extype):
|
| 580 |
+
fig2 = topic_model.visualize_document_datamap(topic_abs, embeddings=embeddings, custom_labels = True)
|
| 581 |
+
return fig2
|
| 582 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 583 |
+
def Vis_Hierarchy(extype):
|
| 584 |
+
fig3 = topic_model.visualize_hierarchy(top_n_topics=num_topic, custom_labels = True)
|
| 585 |
+
return fig3
|
| 586 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 587 |
+
def Vis_Heatmap(extype):
|
| 588 |
+
global topic_model
|
| 589 |
+
fig4 = topic_model.visualize_heatmap(n_clusters=num_topic-1, width=1000, height=1000, custom_labels = True)
|
| 590 |
+
return fig4
|
| 591 |
+
@st.cache_resource(ttl = 3600, show_spinner=False)
|
| 592 |
+
def Vis_Barchart(extype):
|
| 593 |
+
fig5 = topic_model.visualize_barchart(top_n_topics=num_topic, custom_labels = True)
|
| 594 |
+
return fig5
|
| 595 |
+
|
| 596 |
+
tab1, tab2, tab3, tab4 = st.tabs(["📈 Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 597 |
+
with tab1:
|
| 598 |
+
try:
|
| 599 |
+
with st.spinner('Performing computations. Please wait ...'):
|
| 600 |
+
|
| 601 |
+
topic_model, topics, probs, embeddings = bertopic_vis(extype)
|
| 602 |
+
time.sleep(.5)
|
| 603 |
+
st.toast('Visualize Topics', icon='🏃')
|
| 604 |
+
fig1 = Vis_Topics(extype)
|
| 605 |
+
|
| 606 |
+
time.sleep(.5)
|
| 607 |
+
st.toast('Visualize Document', icon='🏃')
|
| 608 |
+
fig2 = Vis_Documents(extype)
|
| 609 |
+
|
| 610 |
+
time.sleep(.5)
|
| 611 |
+
st.toast('Visualize Document Hierarchy', icon='🏃')
|
| 612 |
+
fig3 = Vis_Hierarchy(extype)
|
| 613 |
+
|
| 614 |
+
time.sleep(.5)
|
| 615 |
+
st.toast('Visualize Topic Similarity', icon='🏃')
|
| 616 |
+
fig4 = Vis_Heatmap(extype)
|
| 617 |
+
|
| 618 |
+
time.sleep(.5)
|
| 619 |
+
st.toast('Visualize Terms', icon='🏃')
|
| 620 |
+
fig5 = Vis_Barchart(extype)
|
| 621 |
+
|
| 622 |
+
bertab1, bertab2, bertab3, bertab4, bertab5 = st.tabs(["Visualize Topics", "Visualize Terms", "Visualize Documents",
|
| 623 |
+
"Visualize Document Hierarchy", "Visualize Topic Similarity"])
|
| 624 |
+
|
| 625 |
+
with bertab1:
|
| 626 |
+
st.plotly_chart(fig1, use_container_width=True)
|
| 627 |
+
with bertab2:
|
| 628 |
+
st.plotly_chart(fig5, use_container_width=True)
|
| 629 |
+
with bertab3:
|
| 630 |
+
st.plotly_chart(fig2, use_container_width=True)
|
| 631 |
+
with bertab4:
|
| 632 |
+
st.plotly_chart(fig3, use_container_width=True)
|
| 633 |
+
with bertab5:
|
| 634 |
+
st.plotly_chart(fig4, use_container_width=True)
|
| 635 |
+
|
| 636 |
+
#===download results===#
|
| 637 |
+
results = topic_model.get_topic_info()
|
| 638 |
+
resultf = pd.DataFrame(results)
|
| 639 |
+
resultcsv = resultf.to_csv().encode("utf-8")
|
| 640 |
+
st.download_button(
|
| 641 |
+
label = "Download Results",
|
| 642 |
+
data=resultcsv,
|
| 643 |
+
file_name="results.csv",
|
| 644 |
+
mime="text\csv",
|
| 645 |
+
on_click="ignore",
|
| 646 |
+
)
|
| 647 |
+
|
| 648 |
+
except ValueError:
|
| 649 |
+
st.error('🙇♂️ Please raise the number of topics and click submit')
|
| 650 |
+
|
| 651 |
+
|
| 652 |
+
except NameError:
|
| 653 |
+
st.warning('🖱️ Please click Submit')
|
| 654 |
+
|
| 655 |
+
with tab2:
|
| 656 |
+
st.markdown('**Grootendorst, M. (2022). BERTopic: Neural topic modeling with a class-based TF-IDF procedure. arXiv preprint arXiv:2203.05794.** https://doi.org/10.48550/arXiv.2203.05794')
|
| 657 |
+
|
| 658 |
+
with tab3:
|
| 659 |
+
st.markdown('**Jeet Rawat, A., Ghildiyal, S., & Dixit, A. K. (2022, December 1). Topic modelling of legal documents using NLP and bidirectional encoder representations from transformers. Indonesian Journal of Electrical Engineering and Computer Science, 28(3), 1749.** https://doi.org/10.11591/ijeecs.v28.i3.pp1749-1755')
|
| 660 |
+
st.markdown('**Yao, L. F., Ferawati, K., Liew, K., Wakamiya, S., & Aramaki, E. (2023, April 20). Disruptions in the Cystic Fibrosis Community’s Experiences and Concerns During the COVID-19 Pandemic: Topic Modeling and Time Series Analysis of Reddit Comments. Journal of Medical Internet Research, 25, e45249.** https://doi.org/10.2196/45249')
|
| 661 |
+
|
| 662 |
+
with tab4:
|
| 663 |
+
st.divider()
|
| 664 |
+
st.subheader(':blue[BERTopic]', anchor=False)
|
| 665 |
+
st.text("Click the camera icon on the top right menu")
|
| 666 |
+
st.markdown("")
|
| 667 |
+
st.divider()
|
| 668 |
+
st.subheader(':blue[Downloading CSV Results]', anchor=False)
|
| 669 |
+
st.button("Download Results", on_click="ignore")
|
| 670 |
+
st.text("Click Download results button at bottom of page")
|
| 671 |
+
|
| 672 |
+
except:
|
| 673 |
+
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 674 |
+
st.stop()
|
pages/3 Bidirected Network.py
CHANGED
|
@@ -235,16 +235,16 @@ if uploaded_file is not None:
|
|
| 235 |
col1, col2, col3 = st.columns(3)
|
| 236 |
with col1:
|
| 237 |
supp = st.slider(
|
| 238 |
-
'
|
| 239 |
-
0.001, 1.000, (0.010), on_change=reset_all)
|
| 240 |
with col2:
|
| 241 |
conf = st.slider(
|
| 242 |
-
'
|
| 243 |
-
0.001, 1.000, (0.050), on_change=reset_all)
|
| 244 |
with col3:
|
| 245 |
maxlen = st.slider(
|
| 246 |
'Maximum length of the itemsets generated',
|
| 247 |
-
2, 8, (2), on_change=reset_all)
|
| 248 |
|
| 249 |
tab1, tab2, tab3, tab4 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 250 |
|
|
@@ -278,7 +278,7 @@ if uploaded_file is not None:
|
|
| 278 |
st.error('Please lower your value.', icon="🚨")
|
| 279 |
else:
|
| 280 |
restab = arm_table(extype)
|
| 281 |
-
restab = st.data_editor(restab, use_container_width=True)
|
| 282 |
res = restab[restab['Show'] == True]
|
| 283 |
|
| 284 |
#===visualize===
|
|
@@ -400,7 +400,7 @@ if uploaded_file is not None:
|
|
| 400 |
st.markdown("")
|
| 401 |
st.subheader("Download table as CSV")
|
| 402 |
st.text("Hover cursor over table, and click download arrow")
|
| 403 |
-
st.
|
| 404 |
|
| 405 |
except:
|
| 406 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
|
|
|
| 235 |
col1, col2, col3 = st.columns(3)
|
| 236 |
with col1:
|
| 237 |
supp = st.slider(
|
| 238 |
+
'Support',
|
| 239 |
+
0.001, 1.000, (0.010), on_change=reset_all, help='Frequency of occurrence of keywords in a set of documents')
|
| 240 |
with col2:
|
| 241 |
conf = st.slider(
|
| 242 |
+
'Confidence',
|
| 243 |
+
0.001, 1.000, (0.050), on_change=reset_all, help='Presence of keywords in documents that included the antecedents')
|
| 244 |
with col3:
|
| 245 |
maxlen = st.slider(
|
| 246 |
'Maximum length of the itemsets generated',
|
| 247 |
+
2, 8, (2), on_change=reset_all, help='')
|
| 248 |
|
| 249 |
tab1, tab2, tab3, tab4 = st.tabs(["📈 Result & Generate visualization", "📃 Reference", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 250 |
|
|
|
|
| 278 |
st.error('Please lower your value.', icon="🚨")
|
| 279 |
else:
|
| 280 |
restab = arm_table(extype)
|
| 281 |
+
restab = st.data_editor(restab, use_container_width=True, hide_index=True)
|
| 282 |
res = restab[restab['Show'] == True]
|
| 283 |
|
| 284 |
#===visualize===
|
|
|
|
| 400 |
st.markdown("")
|
| 401 |
st.subheader("Download table as CSV")
|
| 402 |
st.text("Hover cursor over table, and click download arrow")
|
| 403 |
+
st.markdown("")
|
| 404 |
|
| 405 |
except:
|
| 406 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
pages/4 Sunburst.py
CHANGED
|
@@ -4,7 +4,6 @@ import pandas as pd
|
|
| 4 |
import plotly.express as px
|
| 5 |
import numpy as np
|
| 6 |
import sys
|
| 7 |
-
import json
|
| 8 |
from tools import sourceformat as sf
|
| 9 |
|
| 10 |
|
|
@@ -94,6 +93,7 @@ def conv_txt(extype):
|
|
| 94 |
print(papers)
|
| 95 |
return papers
|
| 96 |
|
|
|
|
| 97 |
@st.cache_data(ttl=3600)
|
| 98 |
def conv_json(extype):
|
| 99 |
col_dict={'title': 'title',
|
|
@@ -101,11 +101,7 @@ def conv_json(extype):
|
|
| 101 |
'content_provider_code': 'Document Type',
|
| 102 |
'Keywords':'Source title'
|
| 103 |
}
|
| 104 |
-
|
| 105 |
-
data = json.load(uploaded_file)
|
| 106 |
-
hathifile = data['gathers']
|
| 107 |
-
keywords = pd.DataFrame.from_records(hathifile)
|
| 108 |
-
|
| 109 |
keywords = sf.htrc(keywords)
|
| 110 |
keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
|
| 111 |
keywords.rename(columns=col_dict,inplace=True)
|
|
@@ -146,37 +142,46 @@ if uploaded_file is not None:
|
|
| 146 |
MIN1 = int(papers['Cited by'].min())
|
| 147 |
MAX1 = int(papers['Cited by'].max())
|
| 148 |
GAP = MAX - MIN
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 152 |
|
| 153 |
with tab1:
|
| 154 |
#===sunburst===
|
| 155 |
try:
|
| 156 |
-
papers, MIN, MAX, GAP, MIN1, MAX1 = get_minmax(extype)
|
| 157 |
except KeyError:
|
| 158 |
st.error('Error: Please check again your columns.')
|
| 159 |
sys.exit(1)
|
|
|
|
|
|
|
| 160 |
|
| 161 |
if (GAP != 0):
|
| 162 |
-
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX)
|
| 163 |
-
KEYLIM = st.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1)
|
| 164 |
with st.expander("Filtering setings"):
|
| 165 |
-
invert_keys = st.toggle("Invert keys"
|
| 166 |
-
filtered_keys = st.text_input("Filter words in source, seperate with semicolon (;)", value = ""
|
| 167 |
keylist = filtered_keys.split(";")
|
| 168 |
select_col = st.selectbox("Column to filter from", (list(papers)))
|
| 169 |
else:
|
| 170 |
-
|
| 171 |
YEAR = (MIN, MAX)
|
| 172 |
-
KEYLIM = (MIN1,MAX1)
|
|
|
|
| 173 |
@st.cache_data(ttl=3600)
|
| 174 |
def listyear(extype):
|
| 175 |
global papers
|
| 176 |
years = list(range(YEAR[0],YEAR[1]+1))
|
| 177 |
cited = list(range(KEYLIM[0],KEYLIM[1]+1))
|
|
|
|
|
|
|
| 178 |
papers = papers.loc[papers['Year'].isin(years)]
|
| 179 |
papers = papers.loc[papers['Cited by'].isin(cited)]
|
|
|
|
| 180 |
return years, papers
|
| 181 |
|
| 182 |
@st.cache_data(ttl=3600)
|
|
@@ -186,9 +191,9 @@ if uploaded_file is not None:
|
|
| 186 |
|
| 187 |
#filtering
|
| 188 |
if(invert_keys):
|
| 189 |
-
data = data[data[select_col].
|
| 190 |
else:
|
| 191 |
-
data = data[~data[select_col].
|
| 192 |
|
| 193 |
vis = pd.DataFrame()
|
| 194 |
vis[['doctype','source','citby','year']] = data[['Document Type','Source title','Cited by','Year']]
|
|
@@ -200,14 +205,14 @@ if uploaded_file is not None:
|
|
| 200 |
color_continuous_scale='RdBu',
|
| 201 |
color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
|
| 202 |
fig.update_layout(height=800, width=1200)
|
| 203 |
-
return fig
|
| 204 |
|
| 205 |
years, papers = listyear(extype)
|
| 206 |
|
| 207 |
|
| 208 |
if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
|
| 209 |
|
| 210 |
-
if st.button("Submit"):
|
| 211 |
fig, viz = vis_sunbrust(extype)
|
| 212 |
st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
|
| 213 |
st.dataframe(viz)
|
|
@@ -222,6 +227,7 @@ if uploaded_file is not None:
|
|
| 222 |
with tab3:
|
| 223 |
st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
|
| 224 |
st.markdown("")
|
| 225 |
-
except:
|
|
|
|
| 226 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 227 |
st.stop()
|
|
|
|
| 4 |
import plotly.express as px
|
| 5 |
import numpy as np
|
| 6 |
import sys
|
|
|
|
| 7 |
from tools import sourceformat as sf
|
| 8 |
|
| 9 |
|
|
|
|
| 93 |
print(papers)
|
| 94 |
return papers
|
| 95 |
|
| 96 |
+
|
| 97 |
@st.cache_data(ttl=3600)
|
| 98 |
def conv_json(extype):
|
| 99 |
col_dict={'title': 'title',
|
|
|
|
| 101 |
'content_provider_code': 'Document Type',
|
| 102 |
'Keywords':'Source title'
|
| 103 |
}
|
| 104 |
+
keywords = pd.read_json(uploaded_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
keywords = sf.htrc(keywords)
|
| 106 |
keywords['Cited by'] = keywords.groupby(['Keywords'])['Keywords'].transform('size')
|
| 107 |
keywords.rename(columns=col_dict,inplace=True)
|
|
|
|
| 142 |
MIN1 = int(papers['Cited by'].min())
|
| 143 |
MAX1 = int(papers['Cited by'].max())
|
| 144 |
GAP = MAX - MIN
|
| 145 |
+
unique_stitle = set()
|
| 146 |
+
unique_stitle.update(papers['Source title'].dropna())
|
| 147 |
+
list_stitle = sorted(list(unique_stitle))
|
| 148 |
+
return papers, MIN, MAX, GAP, MIN1, MAX1, list_stitle
|
| 149 |
|
| 150 |
tab1, tab2, tab3 = st.tabs(["📈 Generate visualization", "📓 Recommended Reading", "⬇️ Download Help"])
|
| 151 |
|
| 152 |
with tab1:
|
| 153 |
#===sunburst===
|
| 154 |
try:
|
| 155 |
+
papers, MIN, MAX, GAP, MIN1, MAX1, list_stitle = get_minmax(extype)
|
| 156 |
except KeyError:
|
| 157 |
st.error('Error: Please check again your columns.')
|
| 158 |
sys.exit(1)
|
| 159 |
+
|
| 160 |
+
stitle = st.selectbox('Focus on', (list_stitle), index=None, on_change=reset_all)
|
| 161 |
|
| 162 |
if (GAP != 0):
|
| 163 |
+
YEAR = st.slider('Year', min_value=MIN, max_value=MAX, value=(MIN, MAX))
|
| 164 |
+
KEYLIM = st.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1))
|
| 165 |
with st.expander("Filtering setings"):
|
| 166 |
+
invert_keys = st.toggle("Invert keys")
|
| 167 |
+
filtered_keys = st.text_input("Filter words in source, seperate with semicolon (;)", value = "\n")
|
| 168 |
keylist = filtered_keys.split(";")
|
| 169 |
select_col = st.selectbox("Column to filter from", (list(papers)))
|
| 170 |
else:
|
| 171 |
+
col1.write('You only have data in ', (MAX))
|
| 172 |
YEAR = (MIN, MAX)
|
| 173 |
+
KEYLIM = col2.slider('Cited By Count',min_value = MIN1, max_value = MAX1, value = (MIN1,MAX1), on_change=reset_all)
|
| 174 |
+
|
| 175 |
@st.cache_data(ttl=3600)
|
| 176 |
def listyear(extype):
|
| 177 |
global papers
|
| 178 |
years = list(range(YEAR[0],YEAR[1]+1))
|
| 179 |
cited = list(range(KEYLIM[0],KEYLIM[1]+1))
|
| 180 |
+
if stitle:
|
| 181 |
+
papers = papers[papers['Source title'].str.contains(stitle, case=False, na=False)]
|
| 182 |
papers = papers.loc[papers['Year'].isin(years)]
|
| 183 |
papers = papers.loc[papers['Cited by'].isin(cited)]
|
| 184 |
+
papers['Cited by'] = papers['Cited by'].fillna(0)
|
| 185 |
return years, papers
|
| 186 |
|
| 187 |
@st.cache_data(ttl=3600)
|
|
|
|
| 191 |
|
| 192 |
#filtering
|
| 193 |
if(invert_keys):
|
| 194 |
+
data = data[data[select_col].str.contains('|'.join(keylist), na=False)]
|
| 195 |
else:
|
| 196 |
+
data = data[~data[select_col].str.contains('|'.join(keylist), na=False)]
|
| 197 |
|
| 198 |
vis = pd.DataFrame()
|
| 199 |
vis[['doctype','source','citby','year']] = data[['Document Type','Source title','Cited by','Year']]
|
|
|
|
| 205 |
color_continuous_scale='RdBu',
|
| 206 |
color_continuous_midpoint=np.average(viz['cited by'], weights=viz['total docs']))
|
| 207 |
fig.update_layout(height=800, width=1200)
|
| 208 |
+
return fig
|
| 209 |
|
| 210 |
years, papers = listyear(extype)
|
| 211 |
|
| 212 |
|
| 213 |
if {'Document Type','Source title','Cited by','Year'}.issubset(papers.columns):
|
| 214 |
|
| 215 |
+
if st.button("Submit", on_click = reset_all):
|
| 216 |
fig, viz = vis_sunbrust(extype)
|
| 217 |
st.plotly_chart(fig, height=800, width=1200) #use_container_width=True)
|
| 218 |
st.dataframe(viz)
|
|
|
|
| 227 |
with tab3:
|
| 228 |
st.text("Click the camera icon on the top right menu (you may need to hover your cursor within the visualization)")
|
| 229 |
st.markdown("")
|
| 230 |
+
except Exception as e:
|
| 231 |
+
st.write(e)
|
| 232 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 233 |
st.stop()
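A quick standalone sketch of the keyword filter introduced above: str.contains joins the entries of keylist into a single regex, so each entry acts as an alternative, and na=False keeps NaN cells out of the match. The DataFrame below is invented for illustration and is not the app's data.

import pandas as pd

papers = pd.DataFrame({
    "Source title": ["Journal of Informetrics", "Scientometrics", None],
    "Year": [2020, 2021, 2022],
})
keylist = ["Informetrics", "Sciento"]

# Rows whose source title contains any of the keywords; na=False treats NaN as "no match".
mask = papers["Source title"].str.contains("|".join(keylist), na=False)
kept = papers[mask]       # what the "Invert keys" toggle keeps
dropped = papers[~mask]   # what the default (non-inverted) path keeps
print(kept)
print(dropped)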
|
pages/5 Burst Detection.py
CHANGED
|
@@ -364,84 +364,110 @@ def scattervis(bursts, freq_data, top_n):
|
|
| 364 |
autosize=False
|
| 365 |
)
|
| 366 |
|
| 367 |
-
fig.write_image("scatter_plot.png")
|
| 368 |
-
st.image("scatter_plot.png")
|
| 369 |
-
|
|
|
|
| 370 |
|
| 371 |
@st.cache_data(ttl=3600)
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
|
| 375 |
row, col = 1, 1
|
| 376 |
for i, column in enumerate(freq_data.columns[:top_n]):
|
|
|
|
| 377 |
fig.add_trace(go.Scatter(
|
| 378 |
-
x=
|
|
|
|
|
|
|
|
|
|
| 379 |
line_shape='linear',
|
| 380 |
hoverinfo='text',
|
| 381 |
-
hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
|
| 382 |
-
|
|
|
|
| 383 |
textposition='top center'
|
| 384 |
), row=row, col=col)
|
| 385 |
-
|
|
|
|
| 386 |
for _, row_data in bursts[bursts['label'] == column].iterrows():
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
# Add area chart
|
| 394 |
fig.add_trace(go.Scatter(
|
| 395 |
-
x=
|
| 396 |
-
y=
|
| 397 |
-
fill='tozeroy',
|
|
|
|
|
|
|
|
|
|
| 398 |
), row=row, col=col)
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
align_value = "left" if running_total == "Running total" else "center"
|
| 401 |
valign_value = "bottom" if running_total == "Running total" else "middle"
|
| 402 |
-
|
| 403 |
-
# Add annotation for weight at the bottom
|
| 404 |
fig.add_annotation(
|
| 405 |
-
x=
|
| 406 |
y=y_post,
|
| 407 |
text=f"Weight: {row_data['weight']:.2f}",
|
| 408 |
showarrow=False,
|
| 409 |
-
font=dict(
|
| 410 |
-
color="black",
|
| 411 |
-
size=12),
|
| 412 |
align=align_value,
|
| 413 |
valign=valign_value,
|
| 414 |
textangle=270,
|
| 415 |
row=row, col=col
|
| 416 |
)
|
| 417 |
-
|
| 418 |
-
# Add labels for values only in bursts
|
| 419 |
-
fig.add_trace(go.Scatter(
|
| 420 |
-
x=x_values, y=y_values, mode='lines+markers+text', name=column,
|
| 421 |
-
line_shape='linear',
|
| 422 |
-
hoverinfo='text',
|
| 423 |
-
hovertext=[f"Year: {index}<br>Frequency: {freq}" for index, freq in zip(freq_data.index, freq_data[column])],
|
| 424 |
-
text=y_values,
|
| 425 |
-
textposition='top center'
|
| 426 |
-
), row=row, col=col)
|
| 427 |
-
print(freq_data[column])
|
| 428 |
-
|
| 429 |
|
| 430 |
col += 1
|
| 431 |
if col > 2:
|
| 432 |
col = 1
|
| 433 |
row += 1
|
| 434 |
-
|
|
| 435 |
fig.update_layout(
|
| 436 |
showlegend=False,
|
| 437 |
margin=dict(l=20, r=20, t=100, b=20),
|
| 438 |
-
height=
|
| 439 |
-
width=
|
|
|
|
| 440 |
)
|
| 441 |
|
| 442 |
-
fig.write_image("line_graph.png")
|
| 443 |
-
|
| 444 |
-
|
|
|
|
| 445 |
|
| 446 |
@st.cache_data(ttl=3600)
|
| 447 |
def download_result(freq_data, bursts):
|
|
@@ -496,7 +522,7 @@ if uploaded_file is not None:
|
|
| 496 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
| 497 |
|
| 498 |
if viz_selected == "Line graph":
|
| 499 |
-
linegraph(bursts, freq_data)
|
| 500 |
|
| 501 |
elif viz_selected =="Scatter plot":
|
| 502 |
scattervis(bursts, freq_data, top_n)
|
|
@@ -546,6 +572,7 @@ if uploaded_file is not None:
|
|
| 546 |
st.button('👉 Press to download the list of detected bursts', on_click=None)
|
| 547 |
st.text("Click download button.")
|
| 548 |
|
| 549 |
-
except:
|
|
|
|
| 550 |
st.error("Please ensure that your file or settings are correct. If you think there is a mistake, feel free to reach out to us!", icon="🚨")
|
| 551 |
st.stop()
|
|
|
|
| 364 |
autosize=False
|
| 365 |
)
|
| 366 |
|
| 367 |
+
fig.write_image("scatter_plot.png", width=plot_width, height=plot_height)
|
| 368 |
+
st.image("scatter_plot.png", use_column_width=False)
|
| 369 |
+
|
| 370 |
+
pio.write_image(fig, 'result.png', width=plot_width, height=plot_height, scale=4)
|
| 371 |
|
| 372 |
@st.cache_data(ttl=3600)
|
| 373 |
+
@st.cache_data(ttl=3600)
|
| 374 |
+
def linegraph(bursts, freq_data, top_n, running_total=""):
|
| 375 |
+
num_rows = (top_n + 1) // 2 # 2 columns layout
|
| 376 |
+
|
| 377 |
+
# --- X spacing: each year gets a slot of width 10 (=> ±5 padding) ---
|
| 378 |
+
years = list(freq_data.index)
|
| 379 |
+
spacing = 100
|
| 380 |
+
padding = 200
|
| 381 |
+
x_positions = np.arange(len(years)) * spacing # 0,10,20,...
|
| 382 |
+
tickvals = x_positions
|
| 383 |
+
ticktext = [str(y) for y in years]
|
| 384 |
+
|
| 385 |
+
fig = make_subplots(
|
| 386 |
+
rows=num_rows,
|
| 387 |
+
cols=2,
|
| 388 |
+
subplot_titles=freq_data.columns[:top_n]
|
| 389 |
+
)
|
| 390 |
+
|
| 391 |
row, col = 1, 1
|
| 392 |
for i, column in enumerate(freq_data.columns[:top_n]):
|
| 393 |
+
# main line (x mapped to spaced positions)
|
| 394 |
fig.add_trace(go.Scatter(
|
| 395 |
+
x=x_positions,
|
| 396 |
+
y=freq_data[column].to_numpy(),
|
| 397 |
+
mode='lines+markers+text',
|
| 398 |
+
name=column,
|
| 399 |
line_shape='linear',
|
| 400 |
hoverinfo='text',
|
| 401 |
+
hovertext=[f"Year: {yr}<br>Frequency: {freq}"
|
| 402 |
+
for yr, freq in zip(years, freq_data[column])],
|
| 403 |
+
text=freq_data[column],
|
| 404 |
textposition='top center'
|
| 405 |
), row=row, col=col)
|
| 406 |
+
|
| 407 |
+
# bursts shading + annotation
|
| 408 |
for _, row_data in bursts[bursts['label'] == column].iterrows():
|
| 409 |
+
# slice by positional indices (begin/end are positions)
|
| 410 |
+
x_vals = x_positions[row_data['begin']:row_data['end'] + 1]
|
| 411 |
+
y_vals = freq_data[column].iloc[row_data['begin']:row_data['end'] + 1].to_numpy()
|
| 412 |
+
|
| 413 |
+
# area under the line during burst
|
|
|
|
|
|
|
| 414 |
fig.add_trace(go.Scatter(
|
| 415 |
+
x=x_vals,
|
| 416 |
+
y=y_vals,
|
| 417 |
+
fill='tozeroy',
|
| 418 |
+
mode='lines',
|
| 419 |
+
fillcolor='rgba(0,100,80,0.2)',
|
| 420 |
+
line=dict(width=0) # keep it as a filled area
|
| 421 |
), row=row, col=col)
|
| 422 |
+
|
| 423 |
+
# weight label near the bottom
|
| 424 |
+
y_post = float(np.nanmin(freq_data[column])) * 0.95
|
| 425 |
+
x_offset = 0.5 # small shift within the 10-wide slot
|
| 426 |
+
|
| 427 |
align_value = "left" if running_total == "Running total" else "center"
|
| 428 |
valign_value = "bottom" if running_total == "Running total" else "middle"
|
| 429 |
+
|
|
|
|
| 430 |
fig.add_annotation(
|
| 431 |
+
x=x_vals[0] + x_offset,
|
| 432 |
y=y_post,
|
| 433 |
text=f"Weight: {row_data['weight']:.2f}",
|
| 434 |
showarrow=False,
|
| 435 |
+
font=dict(color="black", size=12),
|
|
|
|
|
|
|
| 436 |
align=align_value,
|
| 437 |
valign=valign_value,
|
| 438 |
textangle=270,
|
| 439 |
row=row, col=col
|
| 440 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
col += 1
|
| 443 |
if col > 2:
|
| 444 |
col = 1
|
| 445 |
row += 1
|
| 446 |
+
|
| 447 |
+
# Dynamic sizing
|
| 448 |
+
plot_height = num_rows * 500
|
| 449 |
+
plot_width = len(years) * spacing + padding
|
| 450 |
+
|
| 451 |
+
# Apply the same x settings to all subplots:
|
| 452 |
+
fig.update_xaxes(
|
| 453 |
+
range=[-spacing/2, x_positions[-1] + spacing/2], # ±5 around the ends
|
| 454 |
+
tickmode='array',
|
| 455 |
+
tickvals=tickvals,
|
| 456 |
+
ticktext=ticktext
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
fig.update_layout(
|
| 460 |
showlegend=False,
|
| 461 |
margin=dict(l=20, r=20, t=100, b=20),
|
| 462 |
+
height=plot_height,
|
| 463 |
+
width=plot_width,
|
| 464 |
+
autosize=False
|
| 465 |
)
|
| 466 |
|
| 467 |
+
fig.write_image("line_graph.png", width=plot_width, height=plot_height)
|
| 468 |
+
|
| 469 |
+
st.image("line_graph.png", use_column_width=False)
|
| 470 |
+
pio.write_image(fig, 'result.png', width=plot_width, height=plot_height, scale=4)
|
| 471 |
|
| 472 |
@st.cache_data(ttl=3600)
|
| 473 |
def download_result(freq_data, bursts):
|
|
|
|
| 522 |
st.info(f'We only detect a burst on {num_unique_labels} word(s), which is {top_n - num_unique_labels} fewer than the top word(s)', icon="ℹ️")
|
| 523 |
|
| 524 |
if viz_selected == "Line graph":
|
| 525 |
+
linegraph(bursts, freq_data, top_n)
|
| 526 |
|
| 527 |
elif viz_selected =="Scatter plot":
|
| 528 |
scattervis(bursts, freq_data, top_n)
|
|
|
|
| 572 |
st.button('👉 Press to download the list of detected bursts', on_click=None)
|
| 573 |
st.text("Click download button.")
|
| 574 |
|
| 575 |
+
except Exception as e:
|
| 576 |
+
st.write(e)
|
| 577 |
st.error("Please ensure that your file or settings are correct. If you think there is a mistake, feel free to reach out to us!", icon="🚨")
|
| 578 |
st.stop()
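The rewritten linegraph() above gives every year the same horizontal slot (0, 100, 200, ...) and then relabels the ticks with the real year values; the burst areas and weight annotations reuse the same positions. A rough standalone sketch of that axis trick with made-up data; it assumes plotly (and kaleido for write_image) is installed.

import numpy as np
import plotly.graph_objects as go

years = [2018, 2019, 2021, 2024]               # invented, possibly irregular years
freq = [3, 7, 2, 5]                            # invented frequencies
spacing = 100
x_positions = np.arange(len(years)) * spacing  # 0, 100, 200, ...

fig = go.Figure(go.Scatter(x=x_positions, y=freq,
                           mode="lines+markers+text",
                           text=freq, textposition="top center"))
# Relabel the evenly spaced positions with the real years.
fig.update_xaxes(range=[-spacing / 2, x_positions[-1] + spacing / 2],
                 tickmode="array",
                 tickvals=x_positions,
                 ticktext=[str(y) for y in years])
fig.write_image("line_graph_sketch.png")       # requires kaleido, like the page itself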
|
pages/6 Keywords Stem.py
CHANGED
|
@@ -291,10 +291,16 @@ if uploaded_file is not None:
|
|
| 291 |
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
|
| 292 |
|
| 293 |
with tab5:
|
| 294 |
-
st.
|
|
|
|
|
|
|
|
|
|
| 295 |
st.divider()
|
| 296 |
-
st.
|
| 297 |
-
st.
|
|
|
|
|
|
|
| 298 |
except:
|
| 299 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 300 |
st.stop()
|
|
|
|
|
|
| 291 |
st.markdown('**Lamba, M., & Madhusudhan, M. (2021, July 31). Text Pre-Processing. Text Mining for Information Professionals, 79–103.** https://doi.org/10.1007/978-3-030-85085-2_3')
|
| 292 |
|
| 293 |
with tab5:
|
| 294 |
+
st.subheader(':blue[Result]', anchor=False)
|
| 295 |
+
st.button('Press to download result 👈')
|
| 296 |
+
st.text("Go to Result and click Download button.")
|
| 297 |
+
|
| 298 |
st.divider()
|
| 299 |
+
st.subheader(':blue[List of Keywords]', anchor=False)
|
| 300 |
+
st.button('Press to download keywords 👈')
|
| 301 |
+
st.text("Go to List of Keywords and click Download button.")
|
| 302 |
+
|
| 303 |
except:
|
| 304 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 305 |
st.stop()
|
| 306 |
+
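The tab5 addition above is plain placeholder guidance: st.subheader headings, st.button placeholders, and st.text hints that point users to the real download buttons elsewhere on the page. A minimal sketch of the same pattern, assuming a recent Streamlit version; the tab label here is illustrative.

import streamlit as st

(tab5,) = st.tabs(["⬇️ Download Help"])
with tab5:
    st.subheader(":blue[Result]", anchor=False)
    st.button("Press to download result 👈")
    st.text("Go to Result and click Download button.")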
|
pages/7 Sentiment Analysis.py
CHANGED
|
@@ -131,7 +131,7 @@ def conv_json(extype):
|
|
| 131 |
keywords.rename(columns=col_dict,inplace=True)
|
| 132 |
return keywords
|
| 133 |
|
| 134 |
-
@st.
|
| 135 |
def conv_pub(extype):
|
| 136 |
if (get_ext(extype)).endswith('.tar.gz'):
|
| 137 |
bytedata = extype.read()
|
|
@@ -168,8 +168,8 @@ if uploaded_file is not None:
|
|
| 168 |
'Choose method',[
|
| 169 |
'TextBlob','NLTKvader']
|
| 170 |
)
|
| 171 |
-
words_to_remove = c1.text_input("Remove specific words. Separate words by semicolons (;)")
|
| 172 |
-
wordcount = c2.number_input(label = "Words displayed", min_value = 1, step = 1, value=5)-1
|
| 173 |
rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
| 174 |
rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
|
| 175 |
|
|
@@ -191,17 +191,23 @@ if uploaded_file is not None:
|
|
| 191 |
paper[ColCho] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
|
| 192 |
|
| 193 |
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
@st.cache_resource(ttl=3600)
|
| 197 |
def remove_words(text):
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
paper['Sentences__'] = paper['Abstract_pre'].map(remove_words)
|
| 203 |
-
|
| 204 |
return paper
|
|
|
|
| 205 |
paper=clean_csv(extype)
|
| 206 |
|
| 207 |
if method == 'NLTKvader':
|
|
@@ -266,11 +272,11 @@ if uploaded_file is not None:
|
|
| 266 |
|
| 267 |
return phrase, phrasepolar, phrasesubject
|
| 268 |
|
| 269 |
-
@st.
|
| 270 |
def mergelist(data):
|
| 271 |
return ' '.join(data)
|
| 272 |
|
| 273 |
-
@st.
|
| 274 |
def assignscore(data):
|
| 275 |
if data>0:
|
| 276 |
return "Positive"
|
|
@@ -357,3 +363,4 @@ if uploaded_file is not None:
|
|
| 357 |
except:
|
| 358 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 359 |
st.stop()
|
|
|
|
|
|
| 131 |
keywords.rename(columns=col_dict,inplace=True)
|
| 132 |
return keywords
|
| 133 |
|
| 134 |
+
@st.cache_data(ttl=3600)
|
| 135 |
def conv_pub(extype):
|
| 136 |
if (get_ext(extype)).endswith('.tar.gz'):
|
| 137 |
bytedata = extype.read()
|
|
|
|
| 168 |
'Choose method',[
|
| 169 |
'TextBlob','NLTKvader']
|
| 170 |
)
|
| 171 |
+
words_to_remove = c1.text_input("Remove specific words. Separate words by semicolons (;)", on_change=reset_all)
|
| 172 |
+
wordcount = c2.number_input(label = "Words displayed", min_value = 1, step = 1, value=5, on_change=reset_all)-1
|
| 173 |
rem_copyright = c1.toggle('Remove copyright statement', value=True, on_change=reset_all)
|
| 174 |
rem_punc = c2.toggle('Remove punctuation', value=True, on_change=reset_all)
|
| 175 |
|
|
|
|
| 191 |
paper[ColCho] = paper['Abstract_pre'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
|
| 192 |
|
| 193 |
words_rmv = [word.strip() for word in words_to_remove.split(";")]
|
| 194 |
+
|
| 195 |
+
@st.cache_data(ttl=3600)
|
|
|
|
| 196 |
def remove_words(text):
|
| 197 |
+
if not isinstance(text, str):
|
| 198 |
+
return text # skip NaN or non-string
|
| 199 |
+
|
| 200 |
+
# Regex pattern: remove exact whole words in words_rmv
|
| 201 |
+
pattern = r'\b(?:' + "|".join(map(re.escape, words_rmv)) + r')\b'
|
| 202 |
+
cleaned_text = re.sub(pattern, '', text)
|
| 203 |
+
|
| 204 |
+
# Remove double spaces created after removal
|
| 205 |
+
return " ".join(cleaned_text.split())
|
| 206 |
|
| 207 |
paper['Sentences__'] = paper['Abstract_pre'].map(remove_words)
|
| 208 |
+
|
| 209 |
return paper
|
| 210 |
+
|
| 211 |
paper=clean_csv(extype)
|
| 212 |
|
| 213 |
if method == 'NLTKvader':
|
|
|
|
| 272 |
|
| 273 |
return phrase, phrasepolar, phrasesubject
|
| 274 |
|
| 275 |
+
@st.cache_data(ttl=3600)
|
| 276 |
def mergelist(data):
|
| 277 |
return ' '.join(data)
|
| 278 |
|
| 279 |
+
@st.cache_data(ttl=3600)
|
| 280 |
def assignscore(data):
|
| 281 |
if data>0:
|
| 282 |
return "Positive"
|
|
|
|
| 363 |
except:
|
| 364 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
| 365 |
st.stop()
|
| 366 |
+
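The new remove_words() builds a whole-word regex from the user-supplied list, so only exact word matches are stripped and the leftover double spaces are collapsed afterwards. A standalone sketch of the same behaviour; the word list and sample sentence are invented.

import re

words_rmv = ["copyright", "elsevier"]
pattern = r"\b(?:" + "|".join(map(re.escape, words_rmv)) + r")\b"

def remove_words(text):
    if not isinstance(text, str):
        return text                      # pass NaN / non-strings through unchanged
    cleaned = re.sub(pattern, "", text)  # strip exact whole-word matches only
    return " ".join(cleaned.split())     # collapse the double spaces left behind

print(remove_words("copyright 2023 elsevier ltd. all rights reserved"))
# -> "2023 ltd. all rights reserved"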
|
pages/8 Shifterator.py
CHANGED
|
@@ -399,8 +399,9 @@ if uploaded_file is not None:
|
|
| 399 |
'Choose column',
|
| 400 |
(df_col_sel), on_change=reset_all)
|
| 401 |
|
| 402 |
-
list_words = paper[column_selected].
|
| 403 |
-
|
|
|
|
| 404 |
|
| 405 |
if column_selected is not None:
|
| 406 |
label1 = col2.selectbox(
|
|
@@ -513,13 +514,13 @@ if uploaded_file is not None:
|
|
| 513 |
|
| 514 |
with tab4:
|
| 515 |
st.subheader(':blue[Result]', anchor=False)
|
| 516 |
-
st.button('📥 Download Graph')
|
| 517 |
st.text("Click Download Graph button.")
|
| 518 |
|
| 519 |
st.divider()
|
| 520 |
st.subheader(':blue[Shifterator Dataframe]', anchor=False)
|
| 521 |
-
st.button('📥
|
| 522 |
-
st.text("Click the Download button to get the CSV result.")
|
| 523 |
|
| 524 |
except:
|
| 525 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
|
|
|
| 399 |
'Choose column',
|
| 400 |
(df_col_sel), on_change=reset_all)
|
| 401 |
|
| 402 |
+
list_words = paper[column_selected].dropna() # remove NaN
|
| 403 |
+
list_words = [w for w in list_words if str(w).strip() != ""] # remove empty strings
|
| 404 |
+
list_unique = sorted(set(list_words))
|
| 405 |
|
| 406 |
if column_selected is not None:
|
| 407 |
label1 = col2.selectbox(
|
|
|
|
| 514 |
|
| 515 |
with tab4:
|
| 516 |
st.subheader(':blue[Result]', anchor=False)
|
| 517 |
+
st.button('📥 Download Graph', on_click="ignore")
|
| 518 |
st.text("Click Download Graph button.")
|
| 519 |
|
| 520 |
st.divider()
|
| 521 |
st.subheader(':blue[Shifterator Dataframe]', anchor=False)
|
| 522 |
+
st.button('📥 Press to download result', on_click="ignore")
|
| 523 |
+
st.text("Click the Download button to get the CSV result.")
|
| 524 |
|
| 525 |
except:
|
| 526 |
st.error("Please ensure that your file is correct. Please contact us if you find that this is an error.", icon="🚨")
|
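The lines added above drop NaN and blank entries before building the sorted list of unique labels. A toy illustration with an invented column and values:

import pandas as pd

paper = pd.DataFrame({"Group": ["A", "", None, "B", "A", "   "]})
column_selected = "Group"

list_words = paper[column_selected].dropna()                     # remove NaN
list_words = [w for w in list_words if str(w).strip() != ""]     # remove empty strings
list_unique = sorted(set(list_words))
print(list_unique)   # ['A', 'B']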