You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
File ~\anaconda3\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy..wrapper(*args, **kwargs)
71 """The decorated function is replaced with this one."""
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
76 # -- look for a chunking-strategy argument --
77 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
File ~\anaconda3\Lib\site-packages\unstructured\partition\pdf.py:920, in _process_uncategorized_text_elements(elements)
918 for el in elements:
919 if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
--> 920 new_el = element_from_text(cast(Text, el).text)
921 new_el.metadata = el.metadata
922 else:
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:80, in is_possible_narrative_text(text, cap_threshold, non_alpha_threshold, languages, language_checks)
75 # NOTE(robinson): it gets read in from the environment as a string so we need to
76 # cast it to a float
77 cap_threshold = float(
78 os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
79 )
---> 80 if exceeds_cap_ratio(text, threshold=cap_threshold):
81 trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") # type: ignore # noqa: E501
82 return False
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:276, in exceeds_cap_ratio(text, threshold)
263 """Checks the title ratio in a section of text. If a sufficient proportion of the words
264 are capitalized, that can be indicated on non-narrative text (i.e. "1A. Risk Factors").
265
(...)
272 the function returns True
273 """
274 # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
275 # The assumption is that sections with multiple sentences are not titles.
--> 276 if sentence_count(text, 3) > 1:
277 return False
279 if text.isupper():
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:225, in sentence_count(text, min_length)
214 def sentence_count(text: str, min_length: Optional[int] = None) -> int:
215 """Checks the sentence count for a section of text. Titles should not be more than one
216 sentence.
217
(...)
223 The min number of words a section needs to be for it to be considered a sentence.
224 """
--> 225 sentences = sent_tokenize(text)
226 count = 0
227 for sentence in sentences:
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:136, in sent_tokenize(text)
133 @lru_cache(maxsize=CACHE_MAX_SIZE)
134 def sent_tokenize(text: str) -> List[str]:
135 """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
--> 136 _download_nltk_packages_if_not_present()
137 return _sent_tokenize(text)
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:130, in _download_nltk_packages_if_not_present()
125 tokenizer_available = check_for_nltk_package(
126 package_category="tokenizers", package_name="punkt"
127 )
129 if not (tokenizer_available and tagger_available):
--> 130 download_nltk_packages()
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:88, in download_nltk_packages()
86 with tempfile.NamedTemporaryFile() as tmp_file:
87 tgz_file = tmp_file.name
---> 88 urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file)
90 file_hash = sha256_checksum(tgz_file)
91 if file_hash != NLTK_DATA_SHA256:
Describe the bug
PermissionError: [Errno 13] Permission denied: 'C:\conda_tmp\tmpmssmgevw'
PermissionError Traceback (most recent call last)
Cell In[5], line 13
11 if local_path:
12 loader = UnstructuredPDFLoader(file_path=local_path)
---> 13 data = loader.load()
14 else:
15 print("Upload a PDF file")
File ~\anaconda3\Lib\site-packages\langchain_core\document_loaders\base.py:30, in BaseLoader.load(self)
28 def load(self) -> List[Document]:
29 """Load data into Document objects."""
---> 30 return list(self.lazy_load())
File ~\anaconda3\Lib\site-packages\langchain_community\document_loaders\unstructured.py:89, in UnstructuredBaseLoader.lazy_load(self)
87 def lazy_load(self) -> Iterator[Document]:
88 """Load file."""
---> 89 elements = self._get_elements()
90 self._post_process_elements(elements)
91 if self.mode == "elements":
File ~\anaconda3\Lib\site-packages\langchain_community\document_loaders\pdf.py:73, in UnstructuredPDFLoader._get_elements(self)
70 def _get_elements(self) -> List:
71 from unstructured.partition.pdf import partition_pdf
---> 73 return partition_pdf(filename=self.file_path, **self.unstructured_kwargs)
File ~\anaconda3\Lib\site-packages\unstructured\documents\elements.py:593, in process_metadata..decorator..wrapper(*args, **kwargs)
591 @functools.wraps(func)
592 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> list[Element]:
--> 593 elements = func(*args, **kwargs)
594 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
596 regex_metadata: dict["str", "str"] = call_args.get("regex_metadata", {})
File ~\anaconda3\Lib\site-packages\unstructured\file_utils\filetype.py:626, in add_filetype..decorator..wrapper(*args, **kwargs)
624 @functools.wraps(func)
625 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 626 elements = func(*args, **kwargs)
627 params = get_call_args_applying_defaults(func, *args, **kwargs)
628 include_metadata = params.get("include_metadata", True)
File ~\anaconda3\Lib\site-packages\unstructured\file_utils\filetype.py:582, in add_metadata..wrapper(*args, **kwargs)
580 @functools.wraps(func)
581 def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> List[Element]:
--> 582 elements = func(*args, **kwargs)
583 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
584 include_metadata = call_args.get("include_metadata", True)
File ~\anaconda3\Lib\site-packages\unstructured\chunking\dispatch.py:74, in add_chunking_strategy..wrapper(*args, **kwargs)
71 """The decorated function is replaced with this one."""
73 # -- call the partitioning function to get the elements --
---> 74 elements = func(*args, **kwargs)
76 # -- look for a chunking-strategy argument --
77 call_args = get_call_args_applying_defaults(func, *args, **kwargs)
File ~\anaconda3\Lib\site-packages\unstructured\partition\pdf.py:202, in partition_pdf(filename, file, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, include_metadata, metadata_filename, metadata_last_modified, chunking_strategy, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, date_from_file_object, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
198 exactly_one(filename=filename, file=file)
200 languages = check_language_args(languages or [], ocr_languages) or ["eng"]
--> 202 return partition_pdf_or_image(
203 filename=filename,
204 file=file,
205 include_page_breaks=include_page_breaks,
206 strategy=strategy,
207 infer_table_structure=infer_table_structure,
208 languages=languages,
209 metadata_last_modified=metadata_last_modified,
210 hi_res_model_name=hi_res_model_name,
211 extract_images_in_pdf=extract_images_in_pdf,
212 extract_image_block_types=extract_image_block_types,
213 extract_image_block_output_dir=extract_image_block_output_dir,
214 extract_image_block_to_payload=extract_image_block_to_payload,
215 date_from_file_object=date_from_file_object,
216 starting_page_number=starting_page_number,
217 extract_forms=extract_forms,
218 form_extraction_skip_tables=form_extraction_skip_tables,
219 **kwargs,
220 )
File ~\anaconda3\Lib\site-packages\unstructured\partition\pdf.py:341, in partition_pdf_or_image(filename, file, is_image, include_page_breaks, strategy, infer_table_structure, ocr_languages, languages, metadata_last_modified, hi_res_model_name, extract_images_in_pdf, extract_image_block_types, extract_image_block_output_dir, extract_image_block_to_payload, date_from_file_object, starting_page_number, extract_forms, form_extraction_skip_tables, **kwargs)
330 with warnings.catch_warnings():
331 elements = _partition_pdf_or_image_with_ocr(
332 filename=filename,
333 file=file,
(...)
339 **kwargs,
340 )
--> 341 out_elements = _process_uncategorized_text_elements(elements)
343 return out_elements
File ~\anaconda3\Lib\site-packages\unstructured\partition\pdf.py:920, in _process_uncategorized_text_elements(elements)
918 for el in elements:
919 if hasattr(el, "category") and el.category == ElementType.UNCATEGORIZED_TEXT:
--> 920 new_el = element_from_text(cast(Text, el).text)
921 new_el.metadata = el.metadata
922 else:
File ~\anaconda3\Lib\site-packages\unstructured\partition\text.py:294, in element_from_text(text, coordinates, coordinate_system)
288 elif is_possible_numbered_list(text):
289 return ListItem(
290 text=text,
291 coordinates=coordinates,
292 coordinate_system=coordinate_system,
293 )
--> 294 elif is_possible_narrative_text(text):
295 return NarrativeText(
296 text=text,
297 coordinates=coordinates,
298 coordinate_system=coordinate_system,
299 )
300 elif is_possible_title(text):
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:80, in is_possible_narrative_text(text, cap_threshold, non_alpha_threshold, languages, language_checks)
75 # NOTE(robinson): it gets read in from the environment as a string so we need to
76 # cast it to a float
77 cap_threshold = float(
78 os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold),
79 )
---> 80 if exceeds_cap_ratio(text, threshold=cap_threshold):
81 trace_logger.detail(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}") # type: ignore # noqa: E501
82 return False
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:276, in exceeds_cap_ratio(text, threshold)
263 """Checks the title ratio in a section of text. If a sufficient proportion of the words
264 are capitalized, that can be indicated on non-narrative text (i.e. "1A. Risk Factors").
265
(...)
272 the function returns True
273 """
274 # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
275 # The assumption is that sections with multiple sentences are not titles.
--> 276 if sentence_count(text, 3) > 1:
277 return False
279 if text.isupper():
File ~\anaconda3\Lib\site-packages\unstructured\partition\text_type.py:225, in sentence_count(text, min_length)
214 def sentence_count(text: str, min_length: Optional[int] = None) -> int:
215 """Checks the sentence count for a section of text. Titles should not be more than one
216 sentence.
217
(...)
223 The min number of words a section needs to be for it to be considered a sentence.
224 """
--> 225 sentences = sent_tokenize(text)
226 count = 0
227 for sentence in sentences:
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:136, in sent_tokenize(text)
133 @lru_cache(maxsize=CACHE_MAX_SIZE)
134 def sent_tokenize(text: str) -> List[str]:
135 """A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
--> 136 _download_nltk_packages_if_not_present()
137 return _sent_tokenize(text)
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:130, in _download_nltk_packages_if_not_present()
125 tokenizer_available = check_for_nltk_package(
126 package_category="tokenizers", package_name="punkt"
127 )
129 if not (tokenizer_available and tagger_available):
--> 130 download_nltk_packages()
File ~\anaconda3\Lib\site-packages\unstructured\nlp\tokenize.py:88, in download_nltk_packages()
86 with tempfile.NamedTemporaryFile() as tmp_file:
87 tgz_file = tmp_file.name
---> 88 urllib.request.urlretrieve(NLTK_DATA_URL, tgz_file)
90 file_hash = sha256_checksum(tgz_file)
91 if file_hash != NLTK_DATA_SHA256:
File ~\anaconda3\Lib\urllib\request.py:251, in urlretrieve(url, filename, reporthook, data)
249 # Handle temporary file setup.
250 if filename:
--> 251 tfp = open(filename, 'wb')
252 else:
253 tfp = tempfile.NamedTemporaryFile(delete=False)
PermissionError: [Errno 13] Permission denied: 'C:\conda_tmp\tmpmssmgevw'
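Calling `UnstructuredPDFLoader.load()` on a local PDF fails with `PermissionError: [Errno 13] Permission denied` while `unstructured` is downloading its NLTK data. According to the traceback, `download_nltk_packages()` creates a `tempfile.NamedTemporaryFile()` and, while that file is still open, passes its name to `urllib.request.urlretrieve`, which tries to open the same path again with `open(filename, 'wb')`. This is on Windows (the temporary file is `C:\conda_tmp\tmpmssmgevw`), where a named temporary file generally cannot be opened a second time while it is still open, which is the likely cause of the error.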
The code used to reproduce the error:
import os
!echo %TESSDATA_PREFIX%
!echo %TMPDIR%
os.environ['TESSDATA_PREFIX'] = r'C:\Users\Rohit\anaconda3\envs\ocr_env\share\tessdata'
local_path = "WEF_The_Global_Cooperation_Barometer_2024.pdf"
os.environ['TMPDIR'] = r'C:\Users\Aditi Rohit\AppData\Local\Temp'
!echo %TESSDATA_PREFIX%
!echo %TMPDIR%
# Local PDF file uploads
if local_path:
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()
else:
print("Upload a PDF file")
The text was updated successfully, but these errors were encountered: