# Step 1: Extract text from the PDF
import fitz  # PyMuPDF

context = ""
with fitz.open('Single-cell RNA counting at allele and isoform resolution using Smart-seq3.pdf') as pdffile:
    numpages = pdffile.page_count
    for pagenum in range(numpages):
        page = pdffile[pagenum]
        pagetext = page.get_text()
        context += pagetext
# Define the text splitting function
def splittext(text, chunksize=5000):
    from io import StringIO  # You'll need this import for StringIO
    from nltk.tokenize import sent_tokenize  # Assuming you're using nltk for sentence tokenization
    # Minimal chunking sketch: pack whole sentences into chunks of at most ~chunksize characters.
    chunks, buffer = [], StringIO()
    for sentence in sent_tokenize(text):
        if buffer.tell() and buffer.tell() + len(sentence) > chunksize:
            chunks.append(buffer.getvalue().strip())
            buffer = StringIO()
        buffer.write(sentence + " ")
    if buffer.tell():
        chunks.append(buffer.getvalue().strip())
    return chunks
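# NLTK's sent_tokenize needs the 'punkt' tokenizer data; a one-time download (an
# assumption about the environment, not part of the original steps) avoids a
# LookupError on first use. A hypothetical quick check of the chunk sizes follows:
# import nltk
# nltk.download('punkt')
# chunks = splittext(context)
# print(len(chunks), max(len(chunk) for chunk in chunks))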
# Define the GPT-3 completion function
import openai
import time

def gpt3completion(prompt, engine='text-davinci-003', temp=0.5, tokens=1000):
    prompt = prompt.encode(encoding='ASCII', errors='ignore').decode()
    try:
        response = openai.Completion.create(
            engine=engine,
            prompt=prompt,
            temperature=temp,
            max_tokens=tokens
        )
        time.sleep(10)  # Wait for 10 seconds between requests to avoid rate limits
        return response.choices[0].text.strip()
    except Exception as oops:
        return "GPT-3 error: %s" % oops
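# The call above assumes the pre-1.0 openai Python package and a configured API key.
# A minimal, hypothetical setup sketch, reading the key from an environment variable
# instead of hard-coding it, plus a quick smoke test of the helper:
# import os
# openai.api_key = os.environ.get("OPENAI_API_KEY")
# print(gpt3completion("Say hello."))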
# Step 2: Split the text into chunks and use GPT-3 for summarization
# chunks = splittext(context)
# summaries = [gpt3completion("Summarize the following text: " + chunk) for chunk in chunks]
# Step 3: Combine all the summaries for the final abstract
# final_summary = " ".join(summaries)
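# A hypothetical wrap-up (not part of the original steps): print the combined summary
# and save it next to the PDF so the result survives the session.
# print(final_summary)
# with open('smart-seq3_summary.txt', 'w') as outfile:
#     outfile.write(final_summary)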