1

The Quizlet API has been dead for a while, but I want to work on a project that uses their study sets. I spent a while trying to find an API to use, but there were many misleading claims: some say the old API works but requires an access token (which there is no way to generate), and some say it doesn't work at all. I concluded that I'll need to do the extra work without an API and create a backend that fetches the HTML from Quizlet and extracts the cards from it. I did look for packages such as quizlet-fetc and quizlet-fetcher, but neither of them worked. This has been a lot more difficult than expected because Quizlet seems to be blocking requests: "Request failed with status code 403" keeps getting logged whenever I make a request. Below is the code I have so far; I would appreciate any advice.

const express = require('express');
const app = express();
const axios = require('axios');
const cheerio = require('cheerio');
const https = require('https');
const httpProxy = require('http-proxy');

const proxy = httpProxy.createProxyServer();

// Proxy server: log any proxy error and answer the client with a
// plain-text 500 response
proxy.on('error', (err, req, res) => {
  console.error(err);
  const plainTextHeaders = { 'Content-Type': 'text/plain' };
  res.writeHead(500, plainTextHeaders);
  res.end('Error');
});

// Allow all origins to access this API: every response gets the same
// three CORS headers before the request is passed down the chain
app.use((req, res, next) => {
  const corsHeaders = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET, POST, PUT, DELETE',
    'Access-Control-Allow-Headers': 'Content-Type',
  };
  for (const [field, value] of Object.entries(corsHeaders)) {
    res.header(field, value);
  }
  next();
});

// Fetch the HTML for a Quizlet set.
//
// url: absolute URL of the set page to download.
// Returns a promise for the response body (`response.data` from axios).
// Any error raised by `axios.get` propagates to the caller as a rejection.
const fetchQuizletSet = async (url) => {
  const config = {
    headers: {
      // This GET carries no body, so there is nothing for Content-Type to
      // describe; Accept is the header that states which response we want.
      Accept: 'text/html',
    },
    // No custom https agent: TLS certificate verification stays enabled
    // instead of being switched off with `rejectUnauthorized: false`.
  };
  const { data } = await axios.get(url, config);
  return data;
};

// Parse the HTML for a Quizlet set and extract the card data.
//
// html: markup of a set page.
// Returns { title, cards }, where each card is { front, back, image, audio }
// read from the term's word text, definition text, media image and media
// audio source respectively.
const parseQuizletSet = (html) => {
  const $ = cheerio.load(html);

  // Trimmed text of the first descendant lookup under a term node
  const textOf = (node, selector) => $(node).find(selector).text().trim();
  // `src` attribute of a descendant lookup under a term node
  const srcOf = (node, selector) => $(node).find(selector).attr("src");

  const cards = $(".SetPageTerms-term")
    .toArray()
    .map((termNode) => ({
      front: textOf(termNode, ".SetPageTerm-wordText"),
      back: textOf(termNode, ".SetPageTerm-definitionText"),
      image: srcOf(termNode, ".SetPageTerm-media img"),
      audio: srcOf(termNode, ".SetPageTerm-media audio source"),
    }));

  return {
    title: $("h1.PageTitle-heading").text().trim(),
    cards,
  };
};

// Define a route to handle Quizlet set requests.
// Responds with the parsed set as JSON, or with status 500 and the error
// message when fetching or parsing throws.
app.get('/quizlet-set/:setId', async (req, res) => {
  const { setId } = req.params;
  // setId is caller-controlled: percent-encode it so it always stays a single
  // path segment and cannot inject "/", "?" or "#" into the outbound URL.
  const url = `https://quizlet.com/${encodeURIComponent(setId)}`;
  try {
    const html = await fetchQuizletSet(url);
    res.json(parseQuizletSet(html));
  } catch (error) {
    // Failures go to stderr, like the proxy error handler's logging
    console.error(error);
    res.status(500).send(error.message);
  }
});

// Start the server on the port given by the environment, or 3000 when
// PORT is unset or empty
const PORT = process.env.PORT ? process.env.PORT : 3000;
app.listen(PORT, function onListening() {
  console.log(`Server listening on port ${PORT}`);
});

1 Answer 1

1

The Axios request in the code you posted cannot get past Cloudflare, so it just fails with the error "Request failed with status code 403". This can be solved by using puppeteer-extra, as in the code below.

package.json : add the following

"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2"

code : sample output

const express = require('express');
const app = express();
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());

// Allow all origins to access this API by stamping the CORS headers on
// every response before handing the request to the next handler
app.use((req, res, next) => {
  const corsPairs = [
    ['Access-Control-Allow-Origin', '*'],
    ['Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE'],
    ['Access-Control-Allow-Headers', 'Content-Type'],
  ];
  corsPairs.forEach(([field, value]) => {
    res.header(field, value);
  });
  next();
});

// Fetch a Quizlet set by loading its page in headless Chromium and reading
// the JSON embedded in the page's __NEXT_DATA__ script tag.
//
// url: address of the set page to open.
// Returns a promise for { url, socialImg, title, cards }, where each card is
// { front, frontAudio, back, backAudio, image }. Resolves to null (after
// logging the error) when the page props cannot be turned into that shape.
// Rejects when navigation, waiting for the script tag, or the top-level JSON
// parse fails; the browser is closed before the rejection reaches the caller.
const getQuizletSet = async (url) => {
  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
      '--window-size=1920,2700',
      '--lang=en-US,en;q=0.9'
    ]
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36');
    await page.goto(url, {
      waitUntil: 'networkidle2',
      timeout: 30000
    });
    await page.waitForSelector('script[id=__NEXT_DATA__]');

    const raw = await page.$eval('script[id=__NEXT_DATA__]', (el) => el.textContent);
    // Parsed once, outside the inner try, so malformed page JSON still rejects.
    // Check pageProps or dehydratedReduxStateKey below if you need anything
    // more from the page.
    const { pageProps } = JSON.parse(raw).props;

    try {
      const { setTitle, canonicalUrl, socialImageUrl, dehydratedReduxStateKey } = pageProps;
      const terms = Object.values(JSON.parse(dehydratedReduxStateKey).setPage.termIdToTermsMap);
      const cards = terms.map(({
        word: front,
        _wordAudioUrl: frontAudio,
        definition: back,
        _definitionAudioUrl: backAudio,
        _imageUrl: image
      }) => ({
        front,
        frontAudio,
        // remove line breaks; a non-string definition is passed through as-is
        // so one odd card does not discard the whole set
        back: typeof back === 'string' ? back.replace(/[\r\n]/gm, '') : back,
        backAudio,
        image
      }));

      return { url: canonicalUrl, socialImg: socialImageUrl, title: setTitle, cards };
    } catch (error) {
      console.error(error);
      return null;
    }
  } finally {
    // Always release Chromium, including when navigation or waiting throws
    await browser.close();
  }
};

// Define a route to handle Quizlet set requests.
// Responds with whatever getQuizletSet resolves to as JSON, or with status
// 500 and the error message when it rejects.
app.get('/quizlet-set/:setId', async (req, res) => {
  const { setId } = req.params;
  // setId is caller-controlled: percent-encode it so it always stays a single
  // path segment and cannot inject "/", "?" or "#" into the outbound URL.
  const url = `https://quizlet.com/${encodeURIComponent(setId)}`;
  try {
    const data = await getQuizletSet(url);
    // max-age=0 so repeated requests for the same set are not served from cache
    res.setHeader('Cache-Control', 'public, max-age=0');
    res.json(data);
  } catch (error) {
    // Failures go to stderr, matching getQuizletSet's own error logging
    console.error(error);
    res.status(500).send(error.message);
  }
});

// Start the server, listening on the environment's PORT when one is set
// and falling back to 3000 otherwise
const PORT = process.env.PORT ? process.env.PORT : 3000;
app.listen(PORT, function reportListening() {
  console.log(`Server listening on port ${PORT}`);
});
5
  • I just tried your code and it worked once. Now, when I try fetching from the server, I get the error "An error occurred trying to load the resource." in the Network tab. The server console shows no errors. I'm not sure why it worked once and then stopped.
    – Globe
    Commented May 3, 2023 at 0:00
  • @Globe I just tried 10 different queries (688027875, 665940037, 752373053, 730733603, 687402392, 693576267, 656579725, 784293906, 778748788, 780114243) at once to check for problems, but all of them returned results without an error, so I'm not sure what problem you are running into. You can change headless from true to false in the code to see what is going on, as it will open the page in Chromium.
    – idchi
    Commented May 3, 2023 at 6:50
  • Non-existent set IDs return "Waiting failed: 30000ms exceeded", because the __NEXT_DATA__ script doesn't exist on the 'Hmm, looks like you're studying old notes...' page that shows up, and passing nothing returns "Cannot GET /quizlet-set/". Apart from these, I haven't been able to get it to give me any other error yet.
    – idchi
    Commented May 3, 2023 at 16:16
  • 1
    I found that the issue only occurs when trying to use the same set multiple times. That's why your code worked once and only once: I was using the same setId. The way to fix that was to add the 'Cache-Control', 'public, max-age=0' header.
    – Globe
    Commented May 3, 2023 at 23:42
  • Tried the same set/setid multiple times and still don't get an error... which is interesting but if cache-control max-age=0 fixes it in your case then all is OK, I've updated the code as suggested.
    – idchi
    Commented May 4, 2023 at 6:19

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Not the answer you're looking for? Browse other questions tagged or ask your own question.