Spaces:

ml6team
/

keyphrase-extraction

Sleeping

App Files Files Community

Thomas De Decker committed on Jun 16, 2022

Commit

aab5966

1 Parent(s): 8d04b0f

Update description + Fix highlight bugs

Browse files

Files changed (1) hide show

app.py +49 -38

app.py CHANGED Viewed

@@ -17,9 +17,13 @@ def load_pipeline(chosen_model):
         return KeyphraseGenerationPipeline(chosen_model, truncation=True)
 def extract_keyphrases():
     st.session_state.keyphrases = pipe(st.session_state.input_text)
-    st.session_state.history[f"run_{st.session_state.current_run_id}"] = {
         "run_id": st.session_state.current_run_id,
         "model": st.session_state.chosen_model,
         "text": st.session_state.input_text,
@@ -31,7 +35,7 @@ def extract_keyphrases():
 def get_annotated_text(text, keyphrases, color="#d294ff"):
     for keyphrase in keyphrases:
         text = re.sub(
-            rf"({keyphrase})([^A-Za-z])",
             rf"$K:{keyphrases.index(keyphrase)}\2",
             text,
             flags=re.I,
@@ -83,17 +87,6 @@ def render_output(layout, runs, reverse=False):
             unsafe_allow_html=True,
         )
-        if "generation" in run.get("model"):
-            abstractive_keyphrases = [
-                keyphrase
-                for keyphrase in run.get("keyphrases")
-                if keyphrase.lower() not in run.get("text").lower()
-            ]
-            layout.markdown(
-                f"<p style=\"margin-bottom: 0rem\"><strong>Absent keyphrases:</strong> {', '.join(abstractive_keyphrases) if abstractive_keyphrases else 'None' }</p>",
-                unsafe_allow_html=True,
-            )
         result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
         layout.markdown(
             f"""
@@ -102,6 +95,20 @@ def render_output(layout, runs, reverse=False):
             """,
             unsafe_allow_html=True,
         )
         layout.markdown("---")
@@ -125,32 +132,36 @@ with open("css/style.css") as f:
 st.header("🔑 Keyphrase extraction/generation with Transformers")
 description = """
-Keyphrase extraction is a technique in text analysis where you extract the important keyphrases
-from a text. Since this is a time-consuming process, Artificial Intelligence is used to automate it.
-Currently, classical machine learning methods, that use statistics and linguistics, are widely used
-for the extraction process. The fact that these methods have been widely used in the community has
-the advantage that there are many easy-to-use libraries. Now with the recent innovations in
-NLP, transformers can be used to improve keyphrase extraction. Transformers also focus on the semantics and
-context of a document, which is quite an improvement.
-This space gives you the ability to test around with some keyphrase extraction and generation models.
-Keyphrase extraction models are transformers models fine-tuned as a token classification problem where
-the tokens in a text are annotated as B (Beginning of a keyphrase), I (Inside a keyphrases),
 and O (Outside a keyphrase).
-While keyphrase extraction can only extract keyphrases from a given text. Keyphrase generation models
-work a bit differently. Here you use an encoder-decoder model like BART to generate keyphrases from a given text.
-These models also have the ability to generate keyphrases, which are not present in the text 🤯.
-Do you want to see some magic 🧙‍♂️? Try it out yourself! 👇
 """
 st.write(description)
 with st.form("keyphrase-extraction-form"):
-    selectbox_container, _ = st.columns(2)
-    st.session_state.chosen_model = selectbox_container.selectbox(
         "Choose your model:", st.session_state.config.get("models")
     )
@@ -170,7 +181,8 @@ with st.form("keyphrase-extraction-form"):
     )
     with st.spinner("Extracting keyphrases..."):
-        pressed = st.form_submit_button("Extract")
 if pressed and st.session_state.input_text != "":
     with st.spinner("Loading pipeline..."):
@@ -182,13 +194,12 @@ if pressed and st.session_state.input_text != "":
 elif st.session_state.input_text == "":
     st.error("The text input is empty 🙃 Please provide a text in the input field.")
-options = st.multiselect(
-    "Specify the runs you want to see",
-    st.session_state.history.keys(),
-    format_func=lambda run_id: f"Run {run_id.split('_')[1]}",
-)
 if len(st.session_state.history.keys()) > 0:
     if options:
         render_output(
             st,

         return KeyphraseGenerationPipeline(chosen_model, truncation=True)
+def generate_run_id():
+    return f"run_{re.sub('keyphrase-extraction-|keyphrase-generation-', '', st.session_state.chosen_model)}_{st.session_state.current_run_id}"
 def extract_keyphrases():
     st.session_state.keyphrases = pipe(st.session_state.input_text)
+    st.session_state.history[generate_run_id()] = {
         "run_id": st.session_state.current_run_id,
         "model": st.session_state.chosen_model,
         "text": st.session_state.input_text,
 def get_annotated_text(text, keyphrases, color="#d294ff"):
     for keyphrase in keyphrases:
         text = re.sub(
+            rf"({keyphrase})([^A-Za-z0-9])",
             rf"$K:{keyphrases.index(keyphrase)}\2",
             text,
             flags=re.I,
             unsafe_allow_html=True,
         )
         result = get_annotated_text(run.get("text"), list(run.get("keyphrases")))
         layout.markdown(
             f"""
             """,
             unsafe_allow_html=True,
         )
+        if "generation" in run.get("model"):
+            abstractive_keyphrases = [
+                (keyphrase, "KEY", "#FFA500")
+                for keyphrase in run.get("keyphrases")
+                if keyphrase.lower() not in run.get("text").lower()
+            ]
+            for i in range(len(abstractive_keyphrases)):
+                if i % 2 == 0:
+                    abstractive_keyphrases.insert(i + 1, " ")
+            layout.markdown(
+                f"<p style=\"margin: 1rem 0 0 0\"><strong>Absent keyphrases:</strong> {get_annotated_html(*abstractive_keyphrases) if abstractive_keyphrases else 'None' }</p>",
+                unsafe_allow_html=True,
+            )
         layout.markdown("---")
 st.header("🔑 Keyphrase extraction/generation with Transformers")
 description = """
+Keyphrase extraction is a technique in text analysis where you extract the important keyphrases from a document.
+Thanks to these keyphrases humans can understand the content of a text very quickly and easily without reading
+it completely. Keyphrase extraction was first done primarily by human annotators, who read the text in detail
+and then wrote down the most important keyphrases. The disadvantage is that if you work with a lot of documents,
+this process can take a lot of time ⏳.
+Here is where Artificial Intelligence 🤖 comes in. Currently, classical machine learning methods, that use statistical
+and linguistic features, are widely used for the extraction process. Now with deep learning, it is possible to capture
+the semantic meaning of a text even better than these classical methods. Classical methods look at the frequency,
+occurrence and order of words in the text, whereas these neural approaches can capture long-term semantic dependencies
+and context of words in a text.
+This space gives you the ability to extract keyphrases out of a custom text with transformer-based extraction and generation models.
+Keyphrase extraction models are transformer models fine-tuned as a token classification problem where each word in the document
+is classified as being part of a keyphrase or not.
+The labels used during fine-tuning are B (Beginning of a keyphrase), I (Inside a keyphrase),
 and O (Outside a keyphrase).
+While keyphrase extraction uses encoder-only models to interpret the document, keyphrase generation models
+work a bit differently. Here you use an encoder-decoder model (e.g. BART, T5) to generate keyphrases from a given text.
+These models also have the ability to generate keyphrases that are not present in the text 🤯.
+This can be really interesting in certain applications, for example if you want to make a news article more discoverable.
+Try it out yourself! 👇
 """
 st.write(description)
 with st.form("keyphrase-extraction-form"):
+    st.session_state.chosen_model = st.selectbox(
         "Choose your model:", st.session_state.config.get("models")
     )
     )
     with st.spinner("Extracting keyphrases..."):
+        _, button_container = st.columns([7, 1])
+        pressed = button_container.form_submit_button("Extract")
 if pressed and st.session_state.input_text != "":
     with st.spinner("Loading pipeline..."):
 elif st.session_state.input_text == "":
     st.error("The text input is empty 🙃 Please provide a text in the input field.")
 if len(st.session_state.history.keys()) > 0:
+    options = st.multiselect(
+        "Specify the runs you want to see",
+        st.session_state.history.keys(),
+        format_func=lambda run_id: f"Run {run_id.split('_')[-1]}: {run_id.split('_')[1]}",
+    )
     if options:
         render_output(
             st,