Update app.py
Browse files
app.py
CHANGED
|
@@ -230,9 +230,9 @@ def load_embedding_model(model_name="mixedbread-ai/mxbai-embed-large-v1"):
|
|
| 230 |
return tokenizer, model
|
| 231 |
|
| 232 |
@st.cache_data
|
| 233 |
-
def generate_embeddings(
|
| 234 |
"""Generates embeddings for a list of text entries."""
|
| 235 |
-
encoded_input =
|
| 236 |
text_list, padding=True, truncation=True, return_tensors="pt"
|
| 237 |
)
|
| 238 |
with torch.no_grad():
|
|
@@ -256,7 +256,7 @@ def main():
|
|
| 256 |
st.markdown(
|
| 257 |
"""
|
| 258 |
**General Usage Guide**
|
| 259 |
-
|
| 260 |
* Both tools work best with larger datasets (hundreds or thousands of entries).
|
| 261 |
* For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
|
| 262 |
* Output files are compressed to 16 dimensions.
|
|
@@ -270,7 +270,7 @@ def main():
|
|
| 270 |
st.header("Compress Your Embeddings")
|
| 271 |
st.markdown(
|
| 272 |
"""
|
| 273 |
-
Upload a CSV file containing pre-existing embeddings.
|
| 274 |
This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
|
| 275 |
"""
|
| 276 |
)
|
|
@@ -311,7 +311,10 @@ def main():
|
|
| 311 |
help="Enter each text entry on a new line. This tool works best with a large sample size.",
|
| 312 |
)
|
| 313 |
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
| 315 |
text_list = text_input.strip().split("\n")
|
| 316 |
if len(text_list) == 0:
|
| 317 |
st.warning("Please enter some text for embedding")
|
|
@@ -319,7 +322,7 @@ def main():
|
|
| 319 |
try:
|
| 320 |
with st.spinner("Generating and compressing embeddings..."):
|
| 321 |
tokenizer, model = load_embedding_model()
|
| 322 |
-
embeddings = generate_embeddings(
|
| 323 |
compressor = veczip(target_dims=16)
|
| 324 |
retained_indices = compressor.compress(embeddings)
|
| 325 |
compressed_embeddings = embeddings[:, retained_indices]
|
|
|
|
| 230 |
return tokenizer, model
|
| 231 |
|
| 232 |
@st.cache_data
|
| 233 |
+
def generate_embeddings(_tokenizer, model, text_list):
|
| 234 |
"""Generates embeddings for a list of text entries."""
|
| 235 |
+
encoded_input = _tokenizer(
|
| 236 |
text_list, padding=True, truncation=True, return_tensors="pt"
|
| 237 |
)
|
| 238 |
with torch.no_grad():
|
|
|
|
| 256 |
st.markdown(
|
| 257 |
"""
|
| 258 |
**General Usage Guide**
|
| 259 |
+
|
| 260 |
* Both tools work best with larger datasets (hundreds or thousands of entries).
|
| 261 |
* For CSV files with embeddings, ensure that numeric embedding columns are parsed as arrays (e.g. '[1,2,3]' or '1,2,3') and metadata columns are parsed as text or numbers.
|
| 262 |
* Output files are compressed to 16 dimensions.
|
|
|
|
| 270 |
st.header("Compress Your Embeddings")
|
| 271 |
st.markdown(
|
| 272 |
"""
|
| 273 |
+
Upload a CSV file containing pre-existing embeddings.
|
| 274 |
This will reduce the dimensionality of the embeddings to 16 dimensions using `dejan.veczip`.
|
| 275 |
"""
|
| 276 |
)
|
|
|
|
| 311 |
help="Enter each text entry on a new line. This tool works best with a large sample size.",
|
| 312 |
)
|
| 313 |
|
| 314 |
+
generate_button = st.button("Generate and Compress")
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
if generate_button and text_input:
|
| 318 |
text_list = text_input.strip().split("\n")
|
| 319 |
if len(text_list) == 0:
|
| 320 |
st.warning("Please enter some text for embedding")
|
|
|
|
| 322 |
try:
|
| 323 |
with st.spinner("Generating and compressing embeddings..."):
|
| 324 |
tokenizer, model = load_embedding_model()
|
| 325 |
+
embeddings = generate_embeddings(tokenizer, model, text_list)
|
| 326 |
compressor = veczip(target_dims=16)
|
| 327 |
retained_indices = compressor.compress(embeddings)
|
| 328 |
compressed_embeddings = embeddings[:, retained_indices]
|