setName('iaa')->setDescription('Import Annual Abstracts');
        $this->addOption('files', null, InputOption::VALUE_NONE, 'Import Files');

        return $this;
    }

    /**
     * Runs the command.
     *
     * With the "files" option only the file download step is performed;
     * otherwise every URL listed under the "pages" config key is imported.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     *
     * @return int Exit code, always 0. Symfony Console expects an integer
     *             here (older versions coerce any non-numeric value to 0,
     *             newer ones reject it), so 0 is safe on every version.
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $this->input = $input;
        $this->output = $output;
        $this->getLogger()->setVerbose($this->output->isVerbose());

        if ($this->input->getOption('files')) {
            $this->importFiles();

            return 0;
        }

        $this->getLogger()->info('Import Started');
        foreach ($this->getConfig('pages', []) as $url) {
            $this->importPage($url);
        }
        $this->getLogger()->info('Import Finished');

        return 0;
    }

    /**
     * Downloads one page, extracts the configured items from it and imports
     * every item that could be parsed.
     *
     * The year stored with each item is the first run of digits found in
     * the URL. When the URL contains no digits the year is left null and a
     * warning is logged.
     *
     * @param string $url Page URL.
     *
     * @return $this
     */
    protected function importPage($url)
    {
        $matches = [];
        // Guard the match: reading $matches[1] after a failed preg_match()
        // would raise an "undefined offset" error.
        $year = preg_match('/([\d]+)/', $url, $matches) ? $matches[1] : null;

        $this->getLogger()->setPrefix('');
        if (null === $year) {
            $this->getLogger()->warning('No year found in page URL ' . $url);
        }
        $this->getLogger()->info('Loading page ' . $url);

        $crawler = new Crawler($this->getPageHtml($url));
        $results = $crawler->filter($this->getConfig('items'));
        $this->getLogger()->info('Found {count} items', ['count' => $results->count()]);

        $results->each(function (Crawler $node, $index) use ($year) {
            // Prefix log messages with a 1-based row number.
            $this->getLogger()->setPrefix('Row #' . ($index + 1) . '. ');

            $entity = $this->parseEntity($node);
            if ($entity) {
                $entity['year'] = $year;
                // "sorter" keeps the position of the row as reported by each().
                $entity['sorter'] = $index;
                $this->importItem($entity);
            }
        });

        return $this;
    }

    /**
     * Passes a single parsed item to the model's insert() method.
     *
     * @param array $item Field name => value map built by parseEntity(),
     *                    extended with "year" and "sorter".
     *
     * @return $this
     */
    protected function importItem(array $item)
    {
        $this->getModel()->insert($item);

        return $this;
    }

    /**
     * Builds an entity array from one result node using the "entity" config.
     *
     * Each config entry provides a "selector" and a "source" ("text" or
     * "attribute", the latter together with "attributeName"). A selector
     * containing ":last" has that marker stripped and the last matching
     * element is used instead of the first.
     *
     * @param Crawler $node Node to extract the fields from.
     *
     * @return array|false Field name => trimmed value map, or false when a
     *                     selector matched nothing (an error is logged).
     *
     * @throws Exception When a config entry uses an unsupported source.
     */
    protected function parseEntity(Crawler $node)
    {
        $entity = [];

        foreach ($this->getConfig('entity') as $name => $options) {
            $selector = $options['selector'];
            $last = false !== strpos($selector, ':last');
            if ($last) {
                $selector = str_replace(':last', '', $selector);
            }

            $results = $node->filter($selector);
            if (!$results->count()) {
                $this->getLogger()->error('node ' . $options['selector'] . ' not found');

                return false;
            }

            $element = $last ? $results->last() : $results->first();

            switch ($options['source']) {
                case 'text':
                    $value = $element->text();
                    break;
                case 'attribute':
                    $value = $element->attr($options['attributeName']);
                    break;
                default:
                    throw new Exception('Unsupported source: ' . $options['source']);
            }

            // attr() may yield null for a missing attribute; cast so trim()
            // always receives a string.
            $entity[$name] = trim((string) $value);
        }

        // Strip leading slashes from the file value. When no "file" field is
        // configured the value is an empty string.
        $entity['file'] = ltrim(isset($entity['file']) ? $entity['file'] : '', '/');

        return $entity;
    }

    /**
     * Fetches a page with a GET request and returns the response body.
     *
     * @param string $url Page URL.
     *
     * @return string Response body.
     *
     * @throws HttpClientRuntimeException When the response status code is
     *                                    400 or higher.
     */
    protected function getPageHtml($url)
    {
        $client = $this->getHttpClient();
        $client->setUri($url)->setMethod(HttpRequest::METHOD_GET);
        $response = $client->send();

        // Treat every client and server error as a failure, not only the
        // exact "400 Bad Request" status; otherwise error pages get parsed.
        if ($response->getStatusCode() >= HttpResponse::STATUS_CODE_400) {
            throw new HttpClientRuntimeException($response->getStatusCode() . ' ' . $response->getReasonPhrase());
        }

        return $response->getBody();
    }

    /**
     * Downloads all files returned by the model's getFiles() method.
     *
     * A temporary download list is written below the "tempPath" config
     * directory, handed to downloadFileList() and removed afterwards, also
     * when one of the steps throws.
     *
     * @throws Exception When the list cannot be written or aria2c is missing.
     */
    protected function importFiles()
    {
        $this->getLogger()->info('Files Import Started');

        $files = $this->getModel()->getFiles();
        $tempFile = $this->getConfig('tempPath') . '/annual-abstract-file-list.txt';

        try {
            $this->createFileList($files, $tempFile);
            $this->downloadFileList($tempFile);
        } catch (Exception $e) {
            // Do not leave the temporary list behind on failure.
            if (is_file($tempFile)) {
                unlink($tempFile);
            }

            throw $e;
        }

        unlink($tempFile);
        $this->getLogger()->info('Files Import Finished');
    }

    /**
     * Writes a download list to $file.
     *
     * Every entry consists of the URL ("baseUrl" config value, a slash and
     * the file name) on one line, followed by an indented " out=<name>" line.
     *
     * @param array  $files File names.
     * @param string $file  Path of the list file to create.
     *
     * @return string The path that was written ($file).
     *
     * @throws Exception When the file cannot be opened or written.
     */
    protected function createFileList(array $files, $file)
    {
        $handle = fopen($file, 'w+');
        if (false === $handle) {
            throw new Exception('Failed to create file: ' . $file);
        }

        $baseUrl = $this->getConfig('baseUrl');
        foreach ($files as $name) {
            $item = $baseUrl . '/' . $name . PHP_EOL . ' out=' . $name . PHP_EOL;
            if (false === fwrite($handle, $item)) {
                // Release the handle before bailing out.
                fclose($handle);

                throw new Exception('Failed to write file: ' . $file);
            }
        }

        fclose($handle);

        return $file;
    }

    /**
     * Runs aria2c on the given download list, saving into the "fileRoot"
     * config directory.
     *
     * When the output is in debug mode the aria2c output is redirected to a
     * log file named after the list (".txt" replaced by ".log"); otherwise
     * aria2c runs with -q. A non-zero exit code is logged as a warning and
     * does not raise an exception.
     *
     * @param string $file Path of the download list.
     *
     * @return $this
     *
     * @throws Exception When the aria2c binary cannot be located.
     */
    protected function downloadFileList($file)
    {
        // shell_exec() may return null or false; normalise both to an empty
        // string so a missing binary is always detected.
        $aria2c = trim((string) shell_exec('which aria2c'));
        if ('' === $aria2c) {
            throw new Exception('aria2c is not installed');
        }

        $cmd = implode(' ', [
            escapeshellarg($aria2c),
            '-i ' . escapeshellarg($file),
            '-d ' . escapeshellarg($this->getConfig('fileRoot')),
            '-j 25',
            '--remote-time=true',
            '--allow-overwrite=true',
            '--conditional-get=true',
            '--auto-file-renaming=false',
        ]);

        if ($this->output->isDebug()) {
            $logFile = str_replace('.txt', '.log', $file);
            $cmd .= ' >' . escapeshellarg($logFile) . ' 2>&1';
        } else {
            $cmd .= ' -q';
        }

        $output = [];
        $return = 0;
        exec($cmd, $output, $return);

        if (0 !== $return) {
            $this->getLogger()->warning('Some files were not downloaded. Aria2 error number: ' . $return);
        }

        return $this;
    }

    /**
     * Returns the HTTP client, creating a default one on first use.
     *
     * @return HttpClient
     */
    public function getHttpClient()
    {
        if (null === $this->httpClient) {
            $this->setHttpClient(new HttpClient());
        }

        return $this->httpClient;
    }

    /**
     * Sets the HTTP client used to fetch pages.
     *
     * @param HttpClient $httpClient
     *
     * @return $this
     */
    public function setHttpClient(HttpClient $httpClient)
    {
        $this->httpClient = $httpClient;

        return $this;
    }

    /**
     * Returns the model previously set with setModel().
     *
     * This class calls insert() and getFiles() on it.
     *
     * @return mixed
     */
    public function getModel()
    {
        return $this->model;
    }

    /**
     * Sets the model used to store items and to list files.
     *
     * @param mixed $model
     *
     * @return $this
     */
    public function setModel($model)
    {
        $this->model = $model;

        return $this;
    }
}