Skip to content

Commit

Permalink
Refactor and add new edge case code
Browse files Browse the repository at this point in the history
  • Loading branch information
mfisherlevine committed Nov 27, 2024
1 parent 61d5af2 commit 980cfe3
Showing 1 changed file with 118 additions and 57 deletions.
175 changes: 118 additions & 57 deletions python/lsst/summit/extras/soarScraping.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,10 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

import io
import logging
import os
import re
import shutil
import tempfile
import time
import traceback
from dataclasses import dataclass
Expand All @@ -39,6 +38,9 @@
# Check Pillow version to determine the correct resampling filter
from PIL import Image, ImageEnhance
from PIL import __version__ as PILLOW_VERSION
from requests.exceptions import HTTPError

from lsst.summit.utils.utils import getSunAngle

if version.parse(PILLOW_VERSION) >= version.parse("9.1.0"):
resample_filter = Image.LANCZOS
Expand Down Expand Up @@ -80,6 +82,12 @@ def getSeeingAtTime(self, time):
def getSeeingForDataId(self, dataId):
    """Look up the seeing for a dataId (placeholder, not implemented).

    Parameters
    ----------
    dataId : `object`
        The data identifier to look up. Its expected form is not defined
        yet, and the argument is currently unused.

    Raises
    ------
    NotImplementedError
        Always raised; the lookup has not been written yet.
    """
    raise NotImplementedError("getSeeingForDataId is not implemented yet")

def plotTodaysSeeing(self):
    """Plot today's seeing (placeholder, not implemented).

    Raises
    ------
    NotImplementedError
        Always raised; the plotting code has not been written yet.
    """
    raise NotImplementedError("plotTodaysSeeing is not implemented yet")

def plotSeeingForDayObs(self, dayObs):
    """Plot the seeing for a given dayObs (placeholder, not implemented).

    Parameters
    ----------
    dayObs : `object`
        The observing day to plot. Its expected form is not defined yet,
        and the argument is currently unused.

    Raises
    ------
    NotImplementedError
        Always raised; the plotting code has not been written yet.
    """
    raise NotImplementedError("plotSeeingForDayObs is not implemented yet")


class SoarDatabaseBuiler:
STORE_FILE = "seeing_conditions.h5"
Expand All @@ -95,52 +103,79 @@ def __init__(self):
with pd.HDFStore(self.STORE_FILE, mode="w") as store: # noqa: F841
pass # Create an empty store

self.last_etag = None
self.lastModified = None

def getCurrentSeeingFromWebsite(self, timeout=30):
    """Download the current SOAR seeing image and parse it.

    The request carries a custom User-Agent and, when a previous response
    supplied them, ``If-None-Match`` / ``If-Modified-Since`` validators so
    that the server can answer ``304 Not Modified`` instead of resending
    an unchanged image.

    Parameters
    ----------
    timeout : `float`, optional
        Seconds to wait for the server before giving up. Without a
        timeout a stalled connection would block the caller indefinitely.

    Returns
    -------
    seeingConditions : `SeeingConditions` or `None`
        The parsed conditions. `None` is returned when the image is
        unchanged (HTTP 304), when the request fails, or when the image
        cannot be parsed; callers cannot tell these cases apart from the
        return value alone.

    Notes
    -----
    On a non-HTTP failure the traceback is appended to ``self.ERROR_FILE``
    and, if the image bytes had already been downloaded, they are saved
    under ``self.FAILED_FILES_DIR`` for later inspection.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; SoarSeeingMonitor/1.0; +http://tellmeyourseeing.com/bot)"
    }

    # Add conditional headers if we have previous values
    if self.last_etag:
        headers["If-None-Match"] = self.last_etag
    if self.lastModified:
        headers["If-Modified-Since"] = self.lastModified

    # Initialised up front so the error handler can tell whether the
    # download got far enough to have bytes worth saving.
    imageData = None

    try:
        # The headers must actually be sent, otherwise the server can never
        # reply 304 and the branch below is dead code. stream=True defers
        # the body download until .content is read, so the status and
        # Content-Type checks happen before any image bytes are fetched.
        with requests.get(SOAR_IMAGE_URL, headers=headers, stream=True, timeout=timeout) as response:
            if response.status_code == 304:
                # Resource has not changed; no need to process
                print("Image has not changed since the last check.")
                return None

            response.raise_for_status()

            # Remember the validators for the next conditional request.
            self.last_etag = response.headers.get("ETag")
            self.lastModified = response.headers.get("Last-Modified")

            if "image" not in response.headers.get("Content-Type", ""):
                raise ValueError("URL did not return an image.")

            imageData = response.content
            return self.getSeeingConditionsFromBytes(imageData)

    except HTTPError as httpErr:
        # e.g. 403 Forbidden; nothing was downloaded, so nothing to save.
        print(f"HTTP error occurred: {httpErr}")
        return None
    except Exception as e:
        # Deliberately broad: this is the boundary that keeps the polling
        # loop alive. Everything is logged rather than swallowed.
        print(f"An error occurred: {e}")
        with open(self.ERROR_FILE, "a") as f:
            f.write(f"Exception at {datetime.now()}:\n")
            traceback.print_exc(file=f)
        if imageData is not None:
            filename = datetime.now().strftime("%Y-%m-%d_%H:%M:%S") + ".png"
            failedImagePath = os.path.join(self.FAILED_FILES_DIR, filename)
            with open(failedImagePath, "wb") as f:
                f.write(imageData)
        return None

@staticmethod
def adjust_coords(coords, image_height):
def adjustCoords(coords, height):
(x0, y0), (x1, y1) = coords
# Adjust y coordinates
y0_new = image_height - y0
y1_new = image_height - y1
y0_new = height - y0
y1_new = height - y1
# Ensure that upper < lower for the crop function
upper = min(y0_new, y1_new)
lower = max(y0_new, y1_new)
return (x0, upper, x1, lower)

def getSeeingConditionsFromFile(self, filename):
input_image = Image.open(filename)
image_width, image_height = input_image.size # Get image dimensions
coordinates = {
name: self.adjust_coords(coords, image_height) for name, coords in USER_COORDINATES.items()
}
def getSeeingConditionsFromBytes(self, imageData):
inputImage = Image.open(io.BytesIO(imageData))
_, height = inputImage.size # Get image dimensions
coordinates = {name: self.adjustCoords(coords, height) for name, coords in USER_COORDINATES.items()}

# Crop and store sub-images in variables
dateImage = input_image.crop(coordinates["dateImage"])
seeingImage = input_image.crop(coordinates["seeingImage"])
freeAtmSeeingImage = input_image.crop(coordinates["freeAtmSeeingImage"])
groundLayerImage = input_image.crop(coordinates["groundLayerImage"])
dateImage = inputImage.crop(coordinates["dateImage"])
seeingImage = inputImage.crop(coordinates["seeingImage"])
freeAtmSeeingImage = inputImage.crop(coordinates["freeAtmSeeingImage"])
groundLayerImage = inputImage.crop(coordinates["groundLayerImage"])

date = self._getDateTime(dateImage)
seeing = self._getSeeingNumber(seeingImage)
Expand All @@ -149,7 +184,7 @@ def getSeeingConditionsFromFile(self, filename):

return SeeingConditions(date, seeing, freeAtmSeeing, groundLayer)

def get_last_timestamp(self):
def getLastTimestamp(self):
"""Retrieve the last timestamp from the HDFStore."""
with pd.HDFStore(self.STORE_FILE, mode="r") as store:
if "/data" in store.keys():
Expand All @@ -159,68 +194,94 @@ def get_last_timestamp(self):
return None

def run(self, *, pollInterval=30, sunWaitInterval=60, maxSunAngle=-2):
    """Poll the seeing monitor forever, storing each new measurement.

    Each iteration fetches the current seeing conditions and, if their
    timestamp is newer than the last one recorded, appends a row to the
    ``data`` table in ``self.STORE_FILE``. While the sun is above
    ``maxSunAngle`` no request is made at all. This method never returns.

    Parameters
    ----------
    pollInterval : `float`, optional
        Seconds to sleep between polls, including after a failed fetch.
    sunWaitInterval : `float`, optional
        Seconds to sleep between checks while the sun is too high.
    maxSunAngle : `float`, optional
        Polling only happens while ``getSunAngle()`` is at or below this
        value. NOTE(review): presumably degrees of sun altitude; confirm
        against ``getSunAngle``.
    """
    lastTimestamp = self.getLastTimestamp()
    while True:
        if getSunAngle() > maxSunAngle:
            print("Sun is too high, waiting for the SOAR seeing monitor to have a chance...")
            time.sleep(sunWaitInterval)
            continue

        # Fetch the current seeing conditions
        print("Fetching the current seeing conditions... ", end="")
        seeing = self.getCurrentSeeingFromWebsite()
        if seeing is None:
            print("Something went wrong in the data scraping - check the error logs.")
            time.sleep(pollInterval)
            continue
        newTimestamp = seeing.timestamp

        # Only store the measurement if it is newer than the last recorded
        # one; the site may show the same reading for several polls.
        if lastTimestamp is not None and newTimestamp <= lastTimestamp:
            print("no updates since last time.")
        else:
            print(f'seeing = {seeing.seeing}" @ {newTimestamp} UTC')
            # One-row DataFrame indexed by the measurement time.
            df = pd.DataFrame(
                {
                    "seeing": [seeing.seeing],
                    "freeAtmSeeing": [seeing.freeAtmSeeing],
                    "groundLayer": [seeing.groundLayer],
                },
                index=[newTimestamp],
            )

            # Append the new data to the HDFStore
            with pd.HDFStore(self.STORE_FILE, mode="a") as store:
                store.append("data", df, format="table", data_columns=True)

            lastTimestamp = newTimestamp

        time.sleep(pollInterval)

def _getDateTime(self, dateImage):
    """Read the timestamp out of the cropped date region of the image.

    The crop is preprocessed, passed to ``self.reader.readtext``, and the
    returned text fragments are joined with spaces. Common OCR
    misreadings are then repaired before the text is parsed.

    Parameters
    ----------
    dateImage : `PIL.Image.Image`
        The cropped region containing the date and time.

    Returns
    -------
    date : `datetime.datetime`
        The parsed timestamp, with no timezone attached.

    Raises
    ------
    ValueError
        Raised if the cleaned text matches neither
        ``%Y-%m-%d %H:%M:%S`` nor ``%Y-%m-%d_%H:%M:%S``.
    """
    dateImage = self._preprocessImage(dateImage)
    results = self.reader.readtext(dateImage, detail=0)
    dateString = " ".join(results).strip()

    # Replace common OCR errors. All single-character fixes are applied in
    # one pass *before* the colon clean-up below, so a misread such as
    # "*:" or ": :" cannot leave a doubled colon behind.
    ocrFixes = str.maketrans(
        {
            ".": ":",
            ";": ":",
            "*": ":",
            "o": "0",
            "O": "0",
            "i": "1",
            "l": "1",
            "I": "1",
        }
    )
    dateString = dateString.translate(ocrFixes)
    dateString = re.sub(r" +:", ":", dateString)  # stray space(s) before a colon
    dateString = re.sub(r":{2,}", ":", dateString)  # runs of colons -> one

    try:
        return datetime.strptime(dateString, "%Y-%m-%d %H:%M:%S")
    except ValueError:
        # The separator between date and time is sometimes read as an
        # underscore instead of a space; let this second attempt raise if
        # it does not match either.
        return datetime.strptime(dateString, "%Y-%m-%d_%H:%M:%S")

def _getSeeingNumber(self, image):
scale_factor = 4 # You can adjust this factor as needed
new_size = (image.width * scale_factor, image.height * scale_factor)
image = image.resize(new_size, resample=resample_filter)
@staticmethod
def thresholdImage(image, threshold):
return image.point(lambda x: 0 if x < threshold else 255, "L")

def _preprocessImage(self, image, scaleFactor=4, contrast=2.0, threshold=128):
    """Upscale, grey-scale, contrast-boost and binarise a crop for OCR.

    Parameters
    ----------
    image : `PIL.Image.Image`
        The cropped sub-image to prepare.
    scaleFactor : `int`, optional
        Factor by which both width and height are multiplied before
        further processing.
    contrast : `float`, optional
        Enhancement factor passed to ``ImageEnhance.Contrast.enhance``.
    threshold : `int`
        Cut passed to ``self.thresholdImage`` to produce the final
        black/white image.

    Returns
    -------
    imageArray : `numpy.ndarray`
        The thresholded image as a ``uint8`` array.
    """
    # Enlarge first so the later steps work on more pixels per glyph.
    newSize = (image.width * scaleFactor, image.height * scaleFactor)
    enlarged = image.resize(newSize, resample=resample_filter)

    greyscale = enlarged.convert("L")
    boosted = ImageEnhance.Contrast(greyscale).enhance(contrast)

    blackAndWhite = self.thresholdImage(boosted, threshold)
    return np.array(blackAndWhite).astype(np.uint8)

def _getSeeingNumber(self, image):
image = self._preprocessImage(image)

seeingImage_bw = threshold_image(seeingImage_contrast, threshold)
seeingImage_np = np.array(seeingImage_bw).astype(np.uint8)
results = self.reader.readtext(seeingImage_np, detail=0, contrast_ths=0.1, adjust_contrast=0.5)
results = self.reader.readtext(image, detail=0, contrast_ths=0.1, adjust_contrast=0.5)
seeing_text = " ".join(results).strip()

match = re.search(r"[-+]?\d*\.\d+|\d+", seeing_text)
Expand Down

0 comments on commit 980cfe3

Please sign in to comment.