File size: 21,157 Bytes

821537b

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "460d90da-b986-4c1c-8a66-eab144b0ba8d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Started Fetching Data\n",
      "Failed to fetch data, retrying. Attempt 1/10\n",
      "Failed to fetch data, retrying. Attempt 1/10\n",
      "Fetched data for all the Pages.\n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import time\n",
    "\n",
    "import random\n",
    "# Sample the starting offsets reproducibly: a dedicated, seeded generator\n",
    "# yields the same offsets on every Restart & Run All.\n",
    "# NOTE(review): 968000015 is presumably the row count of the train split and\n",
    "# offsets are presumably 0-indexed -- confirm against the dataset viewer.\n",
    "TOTAL_ROWS = 968000015\n",
    "PAGE_LENGTH = 100  # rows requested per page; keep in sync with num_rows_per_page\n",
    "NUM_PAGES = 500\n",
    "SEED = 42\n",
    "\n",
    "offset_rng = random.Random(SEED)\n",
    "# Cap the offset so a full page of PAGE_LENGTH rows always fits before the end.\n",
    "pages = [offset_rng.randrange(TOTAL_ROWS - PAGE_LENGTH + 1) for _ in range(NUM_PAGES)]\n",
    "# print(pages)\n",
    "\n",
    "# Endpoint and fixed query arguments shared by every request.\n",
    "base_url = \"https://datasets-server.huggingface.co/rows\"\n",
    "params = dict(\n",
    "    dataset=\"tiiuae/falcon-refinedweb\",\n",
    "    config=\"default\",\n",
    "    split=\"train\",\n",
    ")\n",
    "\n",
    "# Page size, maximum attempts per page, and seconds slept between attempts;\n",
    "# all three are read by fetch_data_for_page.\n",
    "num_rows_per_page, retry_limit, retry_delay = 100, 10, 5\n",
    "\n",
    "# Accumulates the `content` field of every fetched row.\n",
    "Falcon = list()\n",
    "\n",
    "print(\"Started Fetching Data\")\n",
    "def fetch_data_for_page(page, timeout=30):\n",
    "    \"\"\"Fetch one page of rows and append each row's `content` to `Falcon`.\n",
    "\n",
    "    Args:\n",
    "        page: Row offset at which the requested slice starts.\n",
    "        timeout: Seconds to wait for the server before an attempt counts as failed.\n",
    "\n",
    "    Raises:\n",
    "        requests.exceptions.RequestException: Re-raised once `retry_limit`\n",
    "            attempts have failed.\n",
    "    \"\"\"\n",
    "    # Build a per-request copy so the shared `params` dict is never mutated.\n",
    "    # The /rows endpoint names its page-size argument `length`, not `limit`.\n",
    "    query = {**params, \"offset\": page, \"length\": num_rows_per_page}\n",
    "\n",
    "    for attempt in range(1, retry_limit + 1):\n",
    "        try:\n",
    "            response = requests.get(base_url, params=query, timeout=timeout)\n",
    "            response.raise_for_status()  # HTTPError on an unsuccessful status code\n",
    "            # Materialise the page first so a failure never leaves a partial page behind.\n",
    "            contents = [row[\"row\"][\"content\"] for row in response.json()[\"rows\"]]\n",
    "        except requests.exceptions.RequestException:\n",
    "            # Covers HTTP errors as well as connection failures and timeouts.\n",
    "            print(\n",
    "                f\"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}\"\n",
    "            )\n",
    "            if attempt == retry_limit:\n",
    "                print(\"Maximum retry limit reached. Unable to fetch data.\")\n",
    "                raise\n",
    "            time.sleep(retry_delay)  # Wait before the next retry\n",
    "        else:\n",
    "            Falcon.extend(contents)\n",
    "            return\n",
    "\n",
    "# Pull every sampled page, one request at a time.\n",
    "for page in pages:\n",
    "    fetch_data_for_page(page)\n",
    "\n",
    "print(\"Fetched data for all the Pages.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f8f3baf1-5480-450b-a456-174a5c114d3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Write the collected texts to a single-column CSV.\n",
    "# encoding=\"utf-8\" is explicit because the default depends on the platform\n",
    "# locale, and pandas reads the file back as UTF-8 later on;\n",
    "# newline=\"\" lets the csv module control line endings itself.\n",
    "with open(\"FalconData2.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as csvfile:\n",
    "    writer = csv.writer(csvfile)\n",
    "    writer.writerow([\"Text\"])  # header row\n",
    "    # One CSV row per fetched text.\n",
    "    writer.writerows([element] for element in Falcon)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ea47c936-2c2b-4414-ba57-74fb6827ec0a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicate rows: 5\n",
      "                                                   Text\n",
      "522                                               Name:\n",
      "11746  Description.\\nReviews\\nThere are no reviews yet.\n",
      "17606  Description.\\nReviews\\nThere are no reviews yet.\n",
      "30436                                               NaN\n",
      "42549                                               !\\n\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Load the scraped texts back from disk.\n",
    "df = pd.read_csv(\"FalconData2.csv\")\n",
    "\n",
    "# Flag rows that repeat an earlier row and keep just those.\n",
    "is_repeat = df.duplicated()\n",
    "duplicate_rows = df.loc[is_repeat]\n",
    "\n",
    "# Report how many repeats there are, then show them.\n",
    "print(f\"Number of duplicate rows: {duplicate_rows.shape[0]}\")\n",
    "print(duplicate_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "f4178cd6-747f-4e05-a9bf-17b97f959e06",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[…]\\nM&amp;S bank […]\\nLowest unsecured loan rate...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>JavaScript seems to be disabled in your browse...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CMTech has designed a game to foster social in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A Storyteller's Point of View\\nMy\\nWriting\\nLe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mspu.us was registered 1 decade 3 years ago. I...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Text\n",
       "0   […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
       "1  JavaScript seems to be disabled in your browse...\n",
       "2  CMTech has designed a game to foster social in...\n",
       "3  A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
       "4  mspu.us was registered 1 decade 3 years ago. I..."
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five rows of the loaded data.\n",
    "df.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "264548c1-4cf4-441f-a433-2f5d57861dc4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>49995</th>\n",
       "      <td>Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49996</th>\n",
       "      <td>This.\\n51.351813 -105.220438\\n12 replies on “L...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49997</th>\n",
       "      <td>VIDEO 1: Panel discussion with John Nichols, a...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49998</th>\n",
       "      <td>The Prototype DA-2A made its first flight on M...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49999</th>\n",
       "      <td>default search action\\nBibTeX record journals/...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text\n",
       "49995  Alumni in Action: Grace Heyne Lybrand\\nWhen Gr...\n",
       "49996  This.\\n51.351813 -105.220438\\n12 replies on “L...\n",
       "49997  VIDEO 1: Panel discussion with John Nichols, a...\n",
       "49998  The Prototype DA-2A made its first flight on M...\n",
       "49999  default search action\\nBibTeX record journals/..."
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the last five rows of the loaded data.\n",
    "df.tail(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "3f215b09-8050-4477-860c-d3ed0a19f45d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Words:\n",
      "0         65\n",
      "1         79\n",
      "2        287\n",
      "3        302\n",
      "4        130\n",
      "        ... \n",
      "49995     64\n",
      "49996    325\n",
      "49997     58\n",
      "49998    623\n",
      "49999     67\n",
      "Name: Text, Length: 50000, dtype: int64\n",
      "Smallest Row:\n",
      "Text    This\n",
      "Name: 270, dtype: object\n",
      "\n",
      "Largest Row:\n",
      "Text    MAMMALS\\n400. Abu Jafar, M.Z., and C. Hays-Sha...\n",
      "Name: 33020, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Whitespace-separated token count per row, kept in a standalone Series\n",
    "# rather than added to `df` as a column.\n",
    "# NOTE: str() turns a missing value into the text \"nan\", so such rows count as one word.\n",
    "word_counts = df[\"Text\"].map(lambda text: len(str(text).split()))\n",
    "\n",
    "print(\"Number of Words:\")\n",
    "print(word_counts)\n",
    "\n",
    "# Index labels of the rows with the fewest and the most words.\n",
    "shortest_label = word_counts.idxmin()\n",
    "longest_label = word_counts.idxmax()\n",
    "smallest_row = df.loc[shortest_label]\n",
    "largest_row = df.loc[longest_label]\n",
    "\n",
    "# Show both extremes.\n",
    "print(\"Smallest Row:\")\n",
    "print(smallest_row)\n",
    "\n",
    "print(\"\\nLargest Row:\")\n",
    "print(largest_row)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "be5a87a8-cfee-4f63-992e-8fa1d4a5cdbb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text    NaN\n",
       "Name: 30436, dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pull one row by integer position for a closer look; 30436 is the row that\n",
    "# the duplicate listing above reports as NaN.\n",
    "target_row=30436\n",
    "specific_row = df.iloc[target_row]\n",
    "specific_row"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "e97d9e18-eaa0-4a1b-96ab-c89a0f4c738d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text    The old wireline Bell telephone system was bui...\n",
      "Name: 19995, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Relies on `specific_row` from the cell that sets `target_row`; run that cell first.\n",
    "# NOTE(review): the saved output shows row 19995, presumably from a run with a\n",
    "# different `target_row` -- re-run after Restart & Run All to refresh it.\n",
    "print(specific_row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "940ef35f-7517-403d-9f42-73760182dcaa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Text    The old wireline Bell telephone system was bui...\n"
     ]
    }
   ],
   "source": [
    "# Series.to_string() abbreviates long cell values, so print the `Text` value\n",
    "# itself to show the whole document.\n",
    "print(specific_row[\"Text\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "915ac669-718f-47f5-b175-a5f928b407db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "57\n"
     ]
    }
   ],
   "source": [
    "# Measure the document itself: len() of to_string() only measured the abbreviated\n",
    "# one-line rendering, column label included. A missing value counts as 0 characters.\n",
    "text_value = specific_row[\"Text\"]\n",
    "print(len(text_value) if isinstance(text_value, str) else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "ab5ee254-9ba7-496b-97c7-3b6185c21971",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training set size: 49000\n",
      "Validation set size: 1000\n"
     ]
    }
   ],
   "source": [
    "# import pandas as pd\n",
    "\n",
    "# # Load the dataset\n",
    "# df = pd.read_csv(\"FalconData2.csv\")\n",
    "\n",
    "# # Calculate the index to split the data at the last 2%\n",
    "# split_index = int(len(df) * 0.980)\n",
    "\n",
    "# # Split the data into training and validation sets\n",
    "# train_df = df.iloc[:split_index]  # First 98% for training\n",
    "# validation_df = df.iloc[split_index:]  # Last 2% for validation\n",
    "\n",
    "# # Display the sizes of the training and validation sets\n",
    "# print(f\"Training set size: {len(train_df)}\")\n",
    "# print(f\"Validation set size: {len(validation_df)}\")\n",
    "\n",
    "# # Optionally, save the datasets to new CSV files\n",
    "# train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
    "# validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7a16fb10-40cd-4668-b363-57ca64819ad3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of rows removed due to NaN values: 2\n",
      "Training set size: 48998\n",
      "Validation set size: 1000\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload the full dataset from disk.\n",
    "df = pd.read_csv(\"FalconData2.csv\")\n",
    "original_length = df.shape[0]\n",
    "\n",
    "# Discard every row that contains a missing value and report how many went.\n",
    "df = df.dropna(axis=0, how=\"any\")\n",
    "removed_rows = original_length - df.shape[0]\n",
    "print(f\"Number of rows removed due to NaN values: {removed_rows}\")\n",
    "\n",
    "# Positional cut point: the leading 98% of the rows train, the trailing 2% validate.\n",
    "split_index = int(df.shape[0] * 0.98)\n",
    "train_df, validation_df = df.iloc[:split_index], df.iloc[split_index:]\n",
    "\n",
    "# Report the size of each part.\n",
    "print(f\"Training set size: {len(train_df)}\")\n",
    "print(f\"Validation set size: {len(validation_df)}\")\n",
    "\n",
    "# Persist both parts without the index column.\n",
    "train_df.to_csv(\"FalconData_train2.csv\", index=False)\n",
    "validation_df.to_csv(\"FalconData_validation2.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "55d929c5-c198-4a91-b31d-65dd83fa00d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of duplicate rows: 4\n",
      "                                                   Text\n",
      "522                                               Name:\n",
      "11745  Description.\\nReviews\\nThere are no reviews yet.\n",
      "17605  Description.\\nReviews\\nThere are no reviews yet.\n",
      "42547                                               !\\n\n"
     ]
    }
   ],
   "source": [
    "# Load the training split that was just written to disk.\n",
    "df1 = pd.read_csv(\"FalconData_train2.csv\")\n",
    "\n",
    "# Flag rows that repeat an earlier row and keep just those.\n",
    "is_repeat1 = df1.duplicated()\n",
    "duplicate_rows1 = df1.loc[is_repeat1]\n",
    "\n",
    "# Report how many repeats there are, then show them.\n",
    "print(f\"Number of duplicate rows: {duplicate_rows1.shape[0]}\")\n",
    "print(duplicate_rows1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3cc404d9-e85e-48ff-aa34-750ebe3e3d3c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>[…]\\nM&amp;S bank […]\\nLowest unsecured loan rate...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>JavaScript seems to be disabled in your browse...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CMTech has designed a game to foster social in...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A Storyteller's Point of View\\nMy\\nWriting\\nLe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>mspu.us was registered 1 decade 3 years ago. I...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                Text\n",
       "0   […]\\nM&S bank […]\\nLowest unsecured loan rate...\n",
       "1  JavaScript seems to be disabled in your browse...\n",
       "2  CMTech has designed a game to foster social in...\n",
       "3  A Storyteller's Point of View\\nMy\\nWriting\\nLe...\n",
       "4  mspu.us was registered 1 decade 3 years ago. I..."
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first five rows of the training split.\n",
    "df1.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "641c606f-6f7f-4097-a8de-a9f6be0047b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>48995</th>\n",
       "      <td>A Chenango County man was charged Wednesday wi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48996</th>\n",
       "      <td>2-Tone Black Personalized Embroidered One Init...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48997</th>\n",
       "      <td>NARAL Pro-Choice America PAC Endorses Colleen ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48998</th>\n",
       "      <td>Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48999</th>\n",
       "      <td>Pantry feeds families in need\\n- Details\\n- Ca...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    Text\n",
       "48995  A Chenango County man was charged Wednesday wi...\n",
       "48996  2-Tone Black Personalized Embroidered One Init...\n",
       "48997  NARAL Pro-Choice America PAC Endorses Colleen ...\n",
       "48998  Posts Tagged by Thomas Paine\\nAEI Hosts Peter ...\n",
       "48999  Pantry feeds families in need\\n- Details\\n- Ca..."
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the last five rows of the training split.\n",
    "df1.tail(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8f7dbf6-5d74-4f8f-85d0-e890a5b8d152",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}