Commit 4564ff13 authored by Alexander König

- added robots.txt to nginx container

- sitemaps are now generated in the entrypoint script
parent 29d6bade
@@ -6,10 +6,11 @@ perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/local.properties
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/default-ssl
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/aai.js
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/aai_config.js
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/aai_config.js
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/aa-statistics.php
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/clarin.eurac.edu.template.metadata.xml
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/webpage/index.html
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/webpage/index.html
perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/commul-customization/robots.txt
# adapt URL and prefix for handle server
perl -pi -e 's/20.500.12084/20.500.12124/g' dockerfiles/commul-customization/config.dct
@@ -22,5 +23,5 @@ perl -pi -e 's/clarin-dev/clarin/g' dockerfiles/docker-compose.yml
echo "Modified all files to point to clarin.eurac.edu instead of clarin-dev.eurac.edu."
# adapt namespace in kubernetes yaml files
sed -i 's/dspace-dev/dspace/' kubernetes/*.yaml
sed -i 's/dspace-dev/dspace/' kubernetes/*.yaml
perl -pi -e 's/clarin-dev/clarin/g' kubernetes/dspace-ingress.yaml
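Both hunks above follow the same bulk-rename pattern: every environment-specific file is rewritten in place with a perl one-liner. A minimal sketch of that pattern as a loop, with an illustrative (not exhaustive) file list; the real script enumerates each file explicitly:

    # sketch: the in-place substitution pattern used throughout this script
    for f in dockerfiles/commul-customization/robots.txt \
             dockerfiles/commul-customization/aai.js; do
        perl -pi -e 's/clarin-dev/clarin/g' "$f"
    done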
@@ -49,6 +49,9 @@ RUN chmod a+x /etc/init.d/nginx
RUN ln -s /opt/nginx/sbin/nginx /usr/sbin/nginx
# copy over static html
COPY commul-customization/webpage/ /opt/nginx/html/
# copy over robots.txt
COPY commul-customization/robots.txt /opt/nginx/html/
RUN chown www-data:www-data /opt/nginx/html/robots.txt
#RUN mkdir /opt/nginx/html/img
#COPY commul-customization/index.html /opt/nginx/html/
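To confirm the COPY and chown took effect, the built image can be inspected directly; the image tag and build context below are assumptions for illustration, not from the repo:

    # sketch: check that robots.txt landed in the image with the right owner
    docker build -t nginx-commul dockerfiles/
    docker run --rm nginx-commul ls -l /opt/nginx/html/robots.txt

On Docker 17.09 or newer, the separate RUN chown could also be folded into the copy as COPY --chown=www-data:www-data, saving a layer.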
@@ -145,8 +145,8 @@ server {
}
# remove the robots and favicon from the logs
location ~ /robots.txt$ { access_log off; log_not_found off; }
# location ~ ^/favicon.ico$ { access_log off; log_not_found off; }
# location ~ /robots.txt$ { access_log off; log_not_found off; }
location ~ ^/favicon.ico$ { access_log off; log_not_found off; }
# this prevents hidden files (beginning with a period) from being served
location ~ /\. { access_log off; log_not_found off; deny all; }
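This swap appears to move the log suppression from robots.txt to favicon.ico, presumably because robots.txt is now a real file copied into the html root. After an edit like this, a syntax check and a header probe are cheap sanity tests:

    # sketch: validate the edited config, then confirm robots.txt is reachable
    nginx -t
    curl -sI https://clarin.eurac.edu/robots.txt | head -n 1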
@@ -2,7 +2,7 @@
# replace environment variables in config files
perl -pi -e 's/\$\{DSPACE_USER\}/$ENV{DSPACE_USER}/; s/\$\{DSPACE_PASSWORD\}/$ENV{DSPACE_PASSWORD}/; s/\$\{MAIL_USER\}/$ENV{MAIL_USER}/; s/\$\{MAIL_PASSWORD\}/$ENV{MAIL_PASSWORD}/; s/\$\{DSPACE_DBNAME\}/$ENV{DSPACE_DBNAME}/; s/\$\{DSPACE_UTILSDBNAME\}/$ENV{DSPACE_UTILSDBNAME}/;' /opt/repository/sources/dspace/local.properties
perl -pi -e 's/\$\{DSPACE_USER\}/$ENV{DSPACE_USER}/; s/\$\{DSPACE_PASSWORD\}/$ENV{DSPACE_PASSWORD}/; s/\$\{MAIL_USER\}/$ENV{MAIL_USER}/; s/\$\{MAIL_PASSWORD\}/$ENV{MAIL_PASSWORD}/; s/\$\{DSPACE_DBNAME\}/$ENV{DSPACE_DBNAME}/; s/\$\{DSPACE_UTILSDBNAME\}/$ENV{DSPACE_UTILSDBNAME}/;' /opt/repository/sources/dspace/target/local.properties
perl -pi -e 's/\$\{DSPACE_USER\}/$ENV{DSPACE_USER}/; s/\$\{DSPACE_PASSWORD\}/$ENV{DSPACE_PASSWORD}/; s/\$\{MAIL_USER\}/$ENV{MAIL_USER}/; s/\$\{MAIL_PASSWORD\}/$ENV{MAIL_PASSWORD}/; s/\$\{DSPACE_DBNAME\}/$ENV{DSPACE_DBNAME}/; s/\$\{DSPACE_UTILSDBNAME\}/$ENV{DSPACE_UTILSDBNAME}/;' /opt/repository/sources/dspace/target/local.properties
perl -pi -e 's/\$\{DSPACE_USER\}/$ENV{DSPACE_USER}/; s/\$\{DSPACE_PASSWORD\}/$ENV{DSPACE_PASSWORD}/; s/\$\{MAIL_USER\}/$ENV{MAIL_USER}/; s/\$\{MAIL_PASSWORD\}/$ENV{MAIL_PASSWORD}/; s/\$\{DSPACE_DBNAME\}/$ENV{DSPACE_DBNAME}/; s/\$\{DSPACE_UTILSDBNAME\}/$ENV{DSPACE_UTILSDBNAME}/;' /opt/lindat-dspace/installation/config/dspace.cfg
perl -pi -e 's/\$\{DSPACE_USER\}/$ENV{DSPACE_USER}/; s/\$\{DSPACE_PASSWORD\}/$ENV{DSPACE_PASSWORD}/; s/\$\{MAIL_USER\}/$ENV{MAIL_USER}/; s/\$\{MAIL_PASSWORD\}/$ENV{MAIL_PASSWORD}/; s/\$\{DSPACE_DBNAME\}/$ENV{DSPACE_DBNAME}/; s/\$\{DSPACE_UTILSDBNAME\}/$ENV{DSPACE_UTILSDBNAME}/;' /opt/lindat-dspace/installation/config/modules/lr.cfg
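These one-liners splice credentials and database names from the environment into the config files. The same substitution could be sketched with envsubst from GNU gettext, assuming it is present in the image and that a template file holds the placeholders (the .in name here is hypothetical):

    # sketch: restrict replacement to exactly the listed placeholders
    envsubst '${DSPACE_USER} ${DSPACE_PASSWORD} ${DSPACE_DBNAME} ${DSPACE_UTILSDBNAME}' \
        < local.properties.in \
        > /opt/repository/sources/dspace/target/local.properties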
@@ -18,9 +18,8 @@ cd /opt/repository/workspace/scripts
make init_statistics
make update_discovery
make update_oai
/opt/tomcat8/bin/shutdown.sh
/opt/lindat-dspace/installation/bin/dspace generate-sitemaps
/opt/tomcat8/bin/shutdown.sh
# start tomcat
/opt/tomcat8/bin/catalina.sh run
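Per the commit message, sitemap generation now happens here in the entrypoint: the indexes are rebuilt, generate-sitemaps writes the sitemap and htmlmap that robots.txt advertises, the Tomcat instance presumably started earlier in the script is shut down, and Tomcat is then started in the foreground as the container's main process. A sketch of that tail; the exec is our suggestion, not in the diff, so Tomcat receives container signals directly:

    # sketch: end of the entrypoint after this change
    /opt/lindat-dspace/installation/bin/dspace generate-sitemaps   # writes sitemap + htmlmap
    /opt/tomcat8/bin/shutdown.sh                                   # stop the bootstrap Tomcat
    exec /opt/tomcat8/bin/catalina.sh run                          # foreground keeps the container alive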
# The FULL URL to the DSpace sitemaps
# The base URL (https://clarin-dev.eurac.edu/repository/xmlui) is auto-filled from the value in dspace.cfg
# XML sitemap is listed first as it is preferred by most search engines
Sitemap: https://clarin-dev.eurac.edu/repository/xmlui/sitemap
Sitemap: https://clarin-dev.eurac.edu/repository/xmlui/htmlmap
##########################
# Default Access Group
# (NOTE: blank lines are not allowable in a group record)
##########################
User-agent: *
# Disable access to Discovery search and filters
Disallow: /discover
Disallow: /search-filter
#
# Optionally uncomment the following line ONLY if sitemaps are working
# and you have verified that your site is being indexed correctly.
# Disallow: /browse
#
# If you have configured DSpace (Solr-based) Statistics to be publicly
# accessible, then you may not want this content to be indexed
# Disallow: /statistics
#
# You also may wish to disallow access to the following paths, in order
# to stop web spiders from accessing user-based content
# Disallow: /contact
# Disallow: /feedback
# Disallow: /forgot
# Disallow: /login
# Disallow: /register
##############################
# Section for misbehaving bots
# The following directives to block specific robots were borrowed from Wikipedia's robots.txt
##############################
# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /
# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /
User-agent: DOC
Disallow: /
User-agent: Zao
Disallow: /
# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /
User-agent: Zealbot
Disallow: /
User-agent: MSIECrawler
Disallow: /
User-agent: SiteSnagger
Disallow: /
User-agent: WebStripper
Disallow: /
User-agent: WebCopier
Disallow: /
User-agent: Fetch
Disallow: /
User-agent: Offline Explorer
Disallow: /
User-agent: Teleport
Disallow: /
User-agent: TeleportPro
Disallow: /
User-agent: WebZIP
Disallow: /
User-agent: linko
Disallow: /
User-agent: HTTrack
Disallow: /
User-agent: Microsoft.URL.Control
Disallow: /
User-agent: Xenu
Disallow: /
User-agent: larbin
Disallow: /
User-agent: libwww
Disallow: /
User-agent: ZyBORG
Disallow: /
User-agent: Download Ninja
Disallow: /
# Misbehaving: requests much too fast:
User-agent: fast
Disallow: /
#
# If your DSpace is going down because of someone using recursive wget,
# you can activate the following rule.
#
# If your own faculty is bringing down your dspace with recursive wget,
# you can advise them to use the --wait option to set the delay between hits.
#
#User-agent: wget
#Disallow: /
#
# The 'grub' distributed client has been *very* poorly behaved.
#
User-agent: grub-client
Disallow: /
#
# Doesn't follow robots.txt anyway, but...
#
User-agent: k2spider
Disallow: /
#
# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /
# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /
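Once deployed (and after the rename script earlier in this commit has rewritten clarin-dev to clarin), the file and the sitemap endpoints it advertises can be spot-checked:

    # sketch: crawler's-eye view of robots.txt and the advertised sitemaps
    curl -s https://clarin.eurac.edu/robots.txt | head -n 5
    curl -sI https://clarin.eurac.edu/repository/xmlui/sitemap | head -n 1
    curl -sI https://clarin.eurac.edu/repository/xmlui/htmlmap | head -n 1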