diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/README.md b/crawler4j-examples/crawler4j-examples-spring-boot/README.md new file mode 100644 index 000000000..76703c099 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/README.md @@ -0,0 +1,50 @@ +# Crawler4j Spring Boot integration example + +By popular demand, here is an example of integration with Spring Boot. It has been designed to be as idiomatic as possible +(`@Service`, `@Async`, hibernate-validator, etc.). + +Run it: + +```bash +$ mvn spring-boot:run -Dspring.profiles.active=dev +``` + +Submit a crawler request: + +```bash + curl -X POST --header 'Content-Type: application/json' --header 'Accept: application/json' -d '{ + "url": "http://example.com", + "callback": "" +}' "http://localhost:8080/api/v1/crawl" +``` + +- callback: you can specify a URL to call back when the crawling is done. The call will POST the response (see below) to the +specified callback URL. + +The response is similar to: + +```json5 +{ + "id":1, + "url":"http://example.com", + "callback":"", + "started":"2018-12-15T08:09:40.436Z", + "status":"ACCEPTED", +} +``` + +Monitor the status of a crawl request: + +```bash +curl -X GET -v "http://localhost:8080/api/v1/crawl/1" +``` + +```json5 +{ + "id":1, + "url":"http://example.com", + "callback":"", + "started":"2018-12-14T19:59:22.665Z", + "status":"DONE", +} +``` diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/pom.xml b/crawler4j-examples/crawler4j-examples-spring-boot/pom.xml new file mode 100644 index 000000000..e42ffb215 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/pom.xml @@ -0,0 +1,148 @@ + + + + crawler4j-parent + edu.uci.ics + 4.5.0-SNAPSHOT + ../../pom.xml + + ${project.groupId}:${project.artifactId} + Open Source Web Crawler for Java - example with Spring Boot + https://github.com/yasserg/crawler4j + crawler4j-examples-spring-boot + 4.0.0 + + + 2.1.0.RELEASE + + + + + edu.uci.ics + crawler4j + 
${project.parent.version} + + + org.springframework.boot + spring-boot-starter-web + + + org.springframework.boot + spring-boot-starter-data-jpa + + + org.springframework.boot + spring-boot-devtools + true + + + com.h2database + h2 + runtime + + + + io.springfox + springfox-swagger2 + 2.9.2 + + + io.springfox + springfox-swagger-ui + 2.9.2 + + + io.springfox + springfox-bean-validators + 2.9.2 + + + + org.modelmapper + modelmapper + 2.3.0 + + + joda-time + joda-time + 2.10.1 + + + com.fasterxml.jackson.datatype + jackson-datatype-joda + 2.9.6 + + + + org.jadira.usertype + usertype.core + 6.0.1.GA + + + org.eclipse.jetty + jetty-client + 9.4.12.v20180830 + + + javax.validation + validation-api + 2.0.0.Final + + + org.hibernate.validator + hibernate-validator + 6.0.2.Final + + + org.hibernate.validator + hibernate-validator-annotation-processor + 6.0.2.Final + + + org.projectlombok + lombok + 1.18.4 + + + + + + + org.springframework.boot + spring-boot-maven-plugin + ${spring.boot.version} + + + org.apache.maven.plugins + maven-resources-plugin + 2.7 + + + @ + + false + + + + + + src/main/resources + true + + + + + + + + org.springframework.boot + spring-boot-dependencies + ${spring.boot.version} + pom + import + + + + \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/Application.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/Application.java new file mode 100644 index 000000000..2d5f8b2c7 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/Application.java @@ -0,0 +1,68 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.examples.spring; + +import edu.uci.ics.crawler4j.crawler.CrawlConfig; +import org.apache.commons.lang3.RandomStringUtils; +import org.eclipse.jetty.client.HttpClient; +import org.modelmapper.ModelMapper; +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Scope; +import org.springframework.data.jpa.repository.config.EnableJpaRepositories; +import org.springframework.scheduling.annotation.EnableAsync; + +import java.nio.file.Paths; + +@EnableJpaRepositories +@SpringBootApplication +@EnableAsync +public class Application { + + public static void main(String[] args) { + SpringApplication.run(Application.class, args); + } + + @Bean + @Scope("prototype") // Because at this point crawler4j does not guarantee CrawlConfig to be thread safe. + public CrawlConfig crawlerConfiguration() { + // See CrawlConfig for all configuration options. 
+ CrawlConfig config = new CrawlConfig(); + config.setCrawlStorageFolder(Paths.get(System.getProperty("java.io.tmpdir"), "crawler", RandomStringUtils.randomAlphabetic(5)).toString()); + config.setIncludeBinaryContentInCrawling(false); + config.setProcessBinaryContentInCrawling(false); + config.setResumableCrawling(false); + config.setMaxDownloadSize(20 * 1024 * 1024); + + return config; + } + + @Bean + public ModelMapper modelMapper() { + return new ModelMapper(); + } + + @Bean(destroyMethod = "stop") + public HttpClient httpClient() throws Exception { + HttpClient client = new HttpClient(); + client.setFollowRedirects(true); + client.start(); + return client; + } + +} \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SpringWebCrawler.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SpringWebCrawler.java new file mode 100644 index 000000000..edd584c66 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SpringWebCrawler.java @@ -0,0 +1,77 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.examples.spring; + +import edu.uci.ics.crawler4j.crawler.Page; +import edu.uci.ics.crawler4j.crawler.WebCrawler; +import edu.uci.ics.crawler4j.examples.spring.entity.CrawlRequest; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerRequestModel; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerStatus; +import edu.uci.ics.crawler4j.examples.spring.repo.CrawlerRequestRepository; +import edu.uci.ics.crawler4j.url.WebURL; +import org.apache.commons.lang3.StringUtils; +import org.joda.time.DateTime; +import org.modelmapper.ModelMapper; + +import java.net.URI; + +public class SpringWebCrawler extends WebCrawler { + + private final CrawlerRequestModel request; + private final CrawlerRequestRepository repo; + private final ModelMapper modelMapper; + + public SpringWebCrawler(CrawlerRequestModel request, CrawlerRequestRepository repo, ModelMapper modelMapper) { + this.request = request; + this.repo = repo; + this.modelMapper = modelMapper; + } + + @Override + public void onStart() { + request.setStatus(CrawlerStatus.WORKING); + repo.save(modelMapper.map(request, CrawlRequest.class)); + } + + public boolean shouldVisit(Page referringPage, WebURL url) { + final URI uri; + try { + uri = URI.create(url.getURL()); + } catch (IllegalArgumentException e) { + logger.warn("Illegal url {}", url.getURL()); + return false; + } + if (StringUtils.startsWithIgnoreCase(referringPage.getWebURL().getURL(), uri.getScheme() + "://" + uri.getHost()) || + StringUtils.startsWithIgnoreCase(referringPage.getWebURL().getURL(), "https://" + uri.getHost())) { + return true; + } + + logger.debug("Ignoring " + url); + return false; + } + @Override + public void onBeforeExit() { + request.setStatus(CrawlerStatus.DONE); + request.setFinished(DateTime.now()); + repo.save(modelMapper.map(request, CrawlRequest.class)); + } + + @Override + protected boolean shouldFollowLinksIn(WebURL url) { + return false; + } +} diff --git 
a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SwaggerConfig.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SwaggerConfig.java new file mode 100644 index 000000000..b5d33e078 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/SwaggerConfig.java @@ -0,0 +1,63 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.examples.spring; + +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import springfox.documentation.builders.ApiInfoBuilder; +import springfox.documentation.builders.PathSelectors; +import springfox.documentation.builders.RequestHandlerSelectors; +import springfox.documentation.service.ApiInfo; +import springfox.documentation.service.Contact; +import springfox.documentation.spi.DocumentationType; +import springfox.documentation.spring.web.plugins.Docket; +import springfox.documentation.swagger.web.UiConfiguration; +import springfox.documentation.swagger.web.UiConfigurationBuilder; +import springfox.documentation.swagger2.annotations.EnableSwagger2; + +@Configuration +@EnableSwagger2 +public class SwaggerConfig { + + @Value("${crawler4j.version}") + public String projectVersion; + + @Bean + public Docket api() { + return new Docket(DocumentationType.SWAGGER_2) + .select() + .apis(RequestHandlerSelectors.basePackage("edu.uci.ics.crawler4j.examples.spring")) + .paths(PathSelectors.any()) + .build().apiInfo(apiEndPointsInfo()).enableUrlTemplating(true); + } + + @Bean + UiConfiguration uiConfig() { + return UiConfigurationBuilder.builder() + .displayRequestDuration(true) + .validatorUrl("") + .build(); + } + + private ApiInfo apiEndPointsInfo() { + return new ApiInfoBuilder().title("crawler4j-example-spring-boot") + .description("Example spring-boot application for crawler4j") + .version(projectVersion) + .build(); + } +} diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/controller/CrawlerController.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/controller/CrawlerController.java new file mode 100644 index 000000000..9bb536dda --- /dev/null +++ 
b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/controller/CrawlerController.java @@ -0,0 +1,100 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.examples.spring.controller; + +import edu.uci.ics.crawler4j.examples.spring.entity.CrawlRequest; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerRequestModel; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerStatus; +import edu.uci.ics.crawler4j.examples.spring.repo.CrawlerRequestRepository; +import edu.uci.ics.crawler4j.examples.spring.service.CrawlerService; +import io.swagger.annotations.Api; +import io.swagger.annotations.ApiOperation; +import io.swagger.annotations.ApiResponse; +import io.swagger.annotations.ApiResponses; +import lombok.extern.slf4j.Slf4j; +import org.joda.time.DateTime; +import org.modelmapper.ModelMapper; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.*; + +import java.util.Optional; + +@Slf4j +@Api(tags = { "crawl" }) +@RequestMapping("/api/v1") +@RestController +public class CrawlerController { + + @Autowired + CrawlerService service; + + @Autowired + CrawlerRequestRepository crawlerRequestRepository; + + @Autowired + ModelMapper modelMapper; + + @ApiOperation(value = "Submit a 
crawl request.", response = CrawlerRequestModel.class, nickname = "submitCrawlRequest") + @ApiResponses(value = { + @ApiResponse(code = 202, message = "Crawl request has been accepted."), + @ApiResponse(code = 500, message = "Crawl request has not been accepted.") + }) + @RequestMapping(path = "/crawl", method = RequestMethod.POST, consumes = "application/json", produces = "application/json") + public ResponseEntity index(@RequestBody CrawlerRequestModel request) { + final CrawlRequest run = modelMapper.map(request, CrawlRequest.class); + run.setStarted(DateTime.now()); + run.setStatus(CrawlerStatus.ACCEPTED); + final CrawlRequest saved = crawlerRequestRepository.save(run); + + final CrawlerRequestModel savedModel = modelMapper.map(saved, CrawlerRequestModel.class); + + try { + service.crawl(savedModel, 2); + return ResponseEntity.status(HttpStatus.ACCEPTED).body(savedModel); + } catch (Exception e) { + log.error("Crawler not started.", e); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(CrawlerRequestModel.EMPTY); + } + } + + @ApiOperation(value = "Status of a crawl request.", + response = CrawlerRequestModel.class, + nickname = "statusCrawlRequest") + @ApiResponses(value = { + @ApiResponse(code = 200, message = "Crawl request exists. See the 'status' field of the model response " + + "to get the status of the crawling process."), + @ApiResponse(code = 404, message = "Crawl request does not exist."), + }) + @RequestMapping(path = "/crawl/{id}", method = RequestMethod.GET, consumes = "application/json", produces = "application/json") + public ResponseEntity status(@PathVariable(name="id") Long requestId) { + + final Optional byId = crawlerRequestRepository.findById(requestId); + + if ( ! 
byId.isPresent()) { + return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); + } + + if ( byId.get().getStatus().equals(CrawlerStatus.WORKING) ) { + return ResponseEntity.ok(modelMapper.map(byId.get(), CrawlerRequestModel.class)); + } + + return ResponseEntity.ok(modelMapper.map(byId.get(), CrawlerRequestModel.class)); + } + +} \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/entity/CrawlRequest.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/entity/CrawlRequest.java new file mode 100644 index 000000000..0bdc5e64b --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/entity/CrawlRequest.java @@ -0,0 +1,55 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.examples.spring.entity; + +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerStatus; +import lombok.Data; +import org.hibernate.validator.constraints.URL; +import org.joda.time.DateTime; + +import javax.persistence.*; + +@Data +@Entity +@Table(name = "crawl_request") +public class CrawlRequest { + @Id + @GeneratedValue + @Column(name="id") + private Long id; + + @URL + private String url; + + @URL + private String callback; + + private DateTime started = DateTime.now(); + + private DateTime finished; + + @Enumerated(EnumType.STRING) + private CrawlerStatus status; + + public CrawlRequest() { + } + + public CrawlRequest(Long id) { + super(); + this.id = id; + } +} \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerRequestModel.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerRequestModel.java new file mode 100644 index 000000000..daacbb862 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerRequestModel.java @@ -0,0 +1,64 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.examples.spring.model; + +import com.fasterxml.jackson.annotation.JsonInclude; +import io.swagger.annotations.ApiModel; +import io.swagger.annotations.ApiModelProperty; +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.apache.commons.lang3.StringUtils; +import org.hibernate.validator.constraints.URL; +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormat; + +@Data +@AllArgsConstructor +@NoArgsConstructor +@JsonInclude(JsonInclude.Include.NON_NULL) +@ApiModel(value="CrawlerRequestModel", description="A simple crawling request.") +public class CrawlerRequestModel { + + @ApiModelProperty(value = "Id of the crawling request.") + private Long id; + + @ApiModelProperty(value = "Url to crawl.") + @URL + private String url; + + @ApiModelProperty(value = "Callback URL to call at end of the crawling.") + @URL + private String callback; + + private DateTime started; + + private DateTime finished; + + @ApiModelProperty(value = "Status of the crawling request.") + private CrawlerStatus status; + + public static final CrawlerRequestModel EMPTY = + new CrawlerRequestModel( + -1L + , StringUtils.EMPTY + , StringUtils.EMPTY + , DateTime.parse("1970-01-01 00:00", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm")) + , DateTime.parse("1970-01-01 00:00", DateTimeFormat.forPattern("yyyy-MM-dd HH:mm")) + , CrawlerStatus.NONE + ); +} diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerStatus.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerStatus.java new file mode 100644 index 000000000..0c096c496 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/model/CrawlerStatus.java @@ -0,0 +1,34 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.examples.spring.model; + +import io.swagger.annotations.ApiModel; +import io.swagger.annotations.ApiModelProperty; + +@ApiModel +public enum CrawlerStatus { + @ApiModelProperty(value = "The request has been accepted and the crawler has started.") + ACCEPTED, + @ApiModelProperty(value = "The crawler is still working.") + WORKING, + @ApiModelProperty(value = "The crawler died for unexpected error.") + ERROR, + @ApiModelProperty(hidden = true) + NONE, + @ApiModelProperty(value = "The crawler process finished. This does not make assumption over a successful state.") + DONE +} \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/repo/CrawlerRequestRepository.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/repo/CrawlerRequestRepository.java new file mode 100644 index 000000000..d54b8bfba --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/repo/CrawlerRequestRepository.java @@ -0,0 +1,26 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.examples.spring.repo; + +import edu.uci.ics.crawler4j.examples.spring.entity.CrawlRequest; +import org.springframework.data.jpa.repository.JpaRepository; + +import javax.transaction.Transactional; + +@Transactional +public interface CrawlerRequestRepository extends JpaRepository { +} \ No newline at end of file diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerFactory.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerFactory.java new file mode 100644 index 000000000..ad562672e --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerFactory.java @@ -0,0 +1,48 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.examples.spring.service; + +import edu.uci.ics.crawler4j.crawler.CrawlController; +import edu.uci.ics.crawler4j.crawler.WebCrawler; +import edu.uci.ics.crawler4j.examples.spring.SpringWebCrawler; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerRequestModel; +import edu.uci.ics.crawler4j.examples.spring.repo.CrawlerRequestRepository; +import org.modelmapper.ModelMapper; +import org.springframework.stereotype.Service; + +public class CrawlerFactory implements CrawlController.WebCrawlerFactory { + + private final CrawlerRequestModel request; + private final CrawlerRequestRepository repo; + private final ModelMapper modelMapper; + + public CrawlerFactory( + CrawlerRequestModel request + , CrawlerRequestRepository crawlerRequestRepository, ModelMapper modelMapper) { + this.request = request; + this.repo = crawlerRequestRepository; + this.modelMapper = modelMapper; + } + + @Override + public WebCrawler newInstance() throws Exception { + WebCrawler webCrawler = new SpringWebCrawler(request, repo, modelMapper); + + return webCrawler; + } + +} diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerService.java b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerService.java new file mode 100644 index 000000000..7cabcc649 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/java/edu/uci/ics/crawler4j/examples/spring/service/CrawlerService.java @@ -0,0 +1,79 @@ +/* + * Copyright 2018 Federico Tolomei + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package edu.uci.ics.crawler4j.examples.spring.service; + +import static org.apache.commons.io.FileUtils.deleteQuietly; +import edu.uci.ics.crawler4j.crawler.CrawlConfig; +import edu.uci.ics.crawler4j.crawler.CrawlController; +import edu.uci.ics.crawler4j.examples.spring.model.CrawlerRequestModel; +import edu.uci.ics.crawler4j.examples.spring.repo.CrawlerRequestRepository; +import edu.uci.ics.crawler4j.fetcher.PageFetcher; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; +import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; +import lombok.extern.slf4j.Slf4j; +import org.modelmapper.ModelMapper; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.scheduling.annotation.Async; +import org.springframework.stereotype.Service; + +import java.io.File; + +@Slf4j +@Service +public class CrawlerService { + + @Autowired + CrawlConfig config; + + @Autowired + CrawlerRequestRepository crawlerRequestRepository; + + @Autowired + ModelMapper modelMapper; + + @Async + public void crawl(CrawlerRequestModel request, int numCrawlers) throws Exception { + /* + * Instantiate the controller for this crawl. 
+ */ + PageFetcher pageFetcher = new PageFetcher(config); + + RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); + // If you want ignore robots.txt + robotstxtConfig.setEnabled(false); + RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); + CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); + controller.addSeed(request.getUrl()); + + CrawlerFactory factory = new CrawlerFactory(request, crawlerRequestRepository, modelMapper); + + controller.startNonBlocking(factory, numCrawlers); + + while (!controller.isFinished()) { + Thread.sleep(10_000); + + log.info("Waiting for crawler request {} to finish." + , request.getId()); + } + + log.info("Crawling done for {}", 10); + + if ( ! deleteQuietly(new File(config.getCrawlStorageFolder())) ) { + log.warn("Something wrong deleting {}" + config.getCrawlStorageFolder()); + } + } +} diff --git a/crawler4j-examples/crawler4j-examples-spring-boot/src/main/resources/application.properties b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/resources/application.properties new file mode 100644 index 000000000..cc44e1142 --- /dev/null +++ b/crawler4j-examples/crawler4j-examples-spring-boot/src/main/resources/application.properties @@ -0,0 +1,2 @@ +spring.jpa.properties.jadira.usertype.autoRegisterUserTypes = true +crawler4j.version=@project.parent.version@ diff --git a/pom.xml b/pom.xml index 63080e618..218cd27c2 100644 --- a/pom.xml +++ b/pom.xml @@ -51,6 +51,7 @@ crawler4j crawler4j-examples/crawler4j-examples-base crawler4j-examples/crawler4j-examples-postgres + crawler4j-examples/crawler4j-examples-spring-boot